[containers-users] Possible additions to Containers and Friends

peter frey pjfrey at sympatico.ca
Sat Feb 10 19:38:13 GMT 2018


(*
Reading recent posts on discuss.ocal.org gives me the impression that 
some tiny
number of utf related routines should be more easily available.
Container's Sequence.t and Gen.t, in particular could benefit from a 
couple of
simple routines.  The code below fits well into that frame work.
I am treating it as public domain code but feel free to make it your own 
and to
include it where appropriate. (Perhaps some of the tests could go into the
example directory...)
The routines here DO NOT verify unless its unavoidable.  In particular they
accept ALL code points that can be encoded by the original Utf8 definition.
It is only a matter of language; we could call it utf31 ...
Restricting the range is trivial; as would be including some 
verification code.



*)

open Containers

(* Create a generator from a utf8-string. Each call produces a code point.
  * The optional parameter srcIdx specifies then start point in the string.
  * srcIdx must point to a valid suffix of a utf8 string.
  * *)
let gen_of_utf8 ?(srcIdx=ref 0) str =
     let lim = String.length str in
     let assemble_next () =     (* we come here only for multi-byte 
characters *)
       let cv jmax accu =      (* utf8 character length; construction of 
uchar *)
         let rec cv' j accu' =          (* inner loop j = 1..jmax ; each 
uchar *)
           let ch = Char.code str.[ !srcIdx + j] in
           let next = ( (accu' lsl 6) lor ( ch land 0x7f )) in
           if j = jmax then begin     (* except for 1st, each char gives 
6 bits*)
             srcIdx := !srcIdx + j +1; Some next            (* +1 for 
1st char *)
           end else cv' (succ j) next
       in cv' 1  (* 1st char is already proccessed! *) accu
     in if !srcIdx >= lim then None else
     let n = str.[ !srcIdx ] in match n with
     (* 0xxxxxxx *) | '\000' .. '\127' -> incr srcIdx; Some (int_of_char n)
     (* 110yyyyy *) | '\128' .. '\223' -> cv 1 ((Char.code n) land 0b11111 )
     (* 1110zzzz *) | '\224' .. '\239' -> cv 2 ((Char.code n) land 0b1111 )
     (* 11110uuu *) | '\240' .. '\247' -> cv 3 ((Char.code n) land 0b111 )
     (* 111110vv *) | '\248' .. '\251' -> cv 4 ((Char.code n) land 0b11 )
     (* 1111110w *) | '\252' .. '\253' -> cv 5 ((Char.code n) land 0b1 )
     (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream")
   in  assemble_next;;


(* The 'natural' stream representation of a utf-string is a generator.
  * But Sequences are not far away ... *)
let makeUtf8Seq ?(srcIdx=ref 0) str = Sequence.of_gen (gen_of_utf8 
~srcIdx str)



(* Convert a code point to a string; Hopefully some day this will be in the
  * standard library. There are various equally trivial versions of this 
around.
  * The returned string is created (allocated) fresh for each k.
  * *)

let code_to_string k =
   let mask = 0b111111 in
   if k < 0 || k >= 0x4000000 then begin
     let s = Bytes.create 6 in
     Bytes.unsafe_set s 0 (Char.chr (0xfc + (k lsr 30)));
     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 24) land 
mask)));
     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 18) land 
mask)));
     Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 12) land 
mask)));
     Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor ((k lsr 6) land 
mask)));
     Bytes.unsafe_set s 5 (Char.unsafe_chr (0x80 lor (k land mask)));
     s end
   else if k <= 0x7f then
     Bytes.make 1 (Char.unsafe_chr k)
   else if k <= 0x7ff then begin
     let s = Bytes.create 2 in
     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xc0 lor (k lsr 6)));
     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor (k land mask)));
     s end
   else if k <= 0xffff then begin
     let s = Bytes.create 3 in
     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xe0 lor (k lsr 12)));
     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 6) land 
mask)));
     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor (k land mask)));
     s end
   else if k <= 0x1fffff then begin
     let s = Bytes.create 4 in
     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf0 + (k lsr 18)));
     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 12) land 
mask)));
     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 6) land 
mask)));
     Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor (k land mask)));
     s end
   else begin
     let s = Bytes.create 5 in
     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf8 + (k lsr 24)));
     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 18) land 
mask)));
     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 12) land 
mask)));
     Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 6) land 
mask)));
     Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor (k land mask)));
     s end

let string_to_code str =
     let cv jmax accu =      (* utf8 character length; construction of 
uchar *)
     if jmax >  String.length str then raise (Failure "string_to_code")
     else let rec cv' j accu' =       (* inner loop j = 1..jmax ; each 
uchar *)
       let ch = Char.code (String.unsafe_get str j) in
       let next = ( (accu' lsl 6) lor ( ch land 0x7f )) in
       if j = jmax then next else cv' (succ j) next
       in cv' 1  (* 1st char is already proccessed! *) accu
     in let n = str.[0] in match n with
     (* 0xxxxxxx *) | '\000' .. '\127' -> int_of_char n
     (* 110yyyyy *) | '\128' .. '\223' -> cv 1 ((Char.code n) land 0b11111 )
     (* 1110zzzz *) | '\224' .. '\239' -> cv 2 ((Char.code n) land 0b1111 )
     (* 11110uuu *) | '\240' .. '\247' -> cv 3 ((Char.code n) land 0b111 )
     (* 111110vv *) | '\248' .. '\251' -> cv 4 ((Char.code n) land 0b11 )
     (* 1111110w *) | '\252' .. '\253' -> cv 5 ((Char.code n) land 0b1 )
     (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream")


(* code_into_string over-writes string s which must be 7-byte string.
  * n-byte String ends with '\000' which is set as needed (in case you 
feed it
  * to a c-program). The last byte of a string contains the # of unused 
bytes in
  * then string. It is set here, for example, by "Bytes.unsafe_set s 6 
'\000'"
  * If the string is longer than 1 word (plus header) all hell breaks loose.
  * Use only if the string is copied afterwards. (Buffer.add_string ... etc)
  * DUBIOUS (and about twice as fast because allocation is not needed)
  * js_of_ocaml might be unhappy with this...
*)
let code_into_string s k =
   let mask = 0b111111 in
   if k < 0 || k >= 0x4000000 then begin
     Bytes.unsafe_set s 0 (Char.chr (0xfc + (k lsr 30)));
     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 24) land 
mask)));
     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 18) land 
mask)));
     Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 12) land 
mask)));
     Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor ((k lsr 6) land 
mask)));
     Bytes.unsafe_set s 5 (Char.unsafe_chr (0x80 lor ( k land mask)));
     Bytes.unsafe_set s 6 '\000';                   (* string internals 
s/b OK *)
     Bytes.unsafe_set s 7 (Char.unsafe_chr  1 );   (* string internals 
DUBIOUS *)
     () end
   else if k <= 0x7f then begin
     Bytes.unsafe_set s 0 (Char.chr k);
     Bytes.unsafe_set s 1 '\000';
     Bytes.unsafe_set s 7 (Char.unsafe_chr  6 );
     () end
   else if k <= 0x7ff then begin
     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xc0 lor (k lsr 6)));
     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor (k land mask)));
     Bytes.unsafe_set s 2 '\000';
     Bytes.unsafe_set s 7 (Char.unsafe_chr  5 );
     () end
   else if k <= 0xffff then begin
     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xe0 lor (k lsr 12)));
     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 6) land 
mask)));
     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor (k land mask)));
     Bytes.unsafe_set s 3 '\000';
     Bytes.unsafe_set s 7 (Char.unsafe_chr  4 );
     () end
   else if k <= 0x1fffff then begin
     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf0 + (k lsr 18)));
     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 12) land 
mask)));
     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 6) land 
mask)));
     Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor (k land mask)));
     Bytes.unsafe_set s 4 '\000';
     Bytes.unsafe_set s 7 (Char.unsafe_chr  3 );
     () end
   else begin
     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf8 + (k lsr 24)));
     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 18) land 
mask)));
     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 12) land 
mask)));
     Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 6) land 
mask)));
     Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor (k land mask)));
     Bytes.unsafe_set s 5 '\000';
     Bytes.unsafe_set s 7 (Char.unsafe_chr  2 );
     () end

(* Automaton to map a (utf8) char Sequence.t into an int Sequence.t
    Accept up to 6 characters, converting them to an integer to feed to k.
    This code demonstrates that being at the receiving end of a sequence can
    cause hardship ... It needs often a state-machine
    ... and perhaps a better name  *)
   let mapUtf8Char2Code (k: int -> unit) =
     let rec next = ref first
     and accu = ref 0
     and first ch = match ch with
       (* 0xxxxxxx *) | '\000' .. '\127' -> k (int_of_char ch); accu := 0
       (* 110yyyyy *) | '\128' .. '\223' -> cv s1 ch 0b11111
       (* 1110zzzz *) | '\224' .. '\239' -> cv s2 ch 0b1111
       (* 11110uuu *) | '\240' .. '\247' -> cv s3 ch 0b111
       (* 111110vv *) | '\248' .. '\251' -> cv s4 ch 0b11
       (* 1111110w *) | '\252' .. '\253' -> cv s5 ch 0b1
       (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream")
     and inline ch = accu := (!accu lsl 6) lor ((Char.code ch) land 0x7f )
     and s1 ch = inline ch; next := first; k !accu
     and s2 ch = inline ch; next := s1 and s3 ch = inline ch; next := s2
     and s4 ch = inline ch; next := s3 and s5 ch = inline ch; next := s4
     and cv startState initialValue mask =
         next := startState;
         accu := (Char.code initialValue) land mask
   in (fun c -> !next c) ;;

let code_len k =
   if k < 0 || k >= 0x4000000 then 6 else if k <= 0x7f then 1
   else if k <= 0x7ff then 2 else if k <= 0xffff then 3
   else if k <= 0x1fffff then 4 else 5


(*
(* 
===========================================================================
  *           various tests of above
*)


(* measure.ml
  * included to avoid dependency on some other (more capable) measuring tool
  *                                *)
let measure fn arg (units:int)  comment =
   (* arg is last argument of fn (or unit not belonging to fn);
      argument units is only used for ratio (units /. elapsed) *)
   let start = Unix.gettimeofday() in
   let res = fn arg in
   let endt = Unix.gettimeofday() in let elapsed = endt -. start in
   let open Printf in
   printf"\n%s\n\tTime:%f Units:%i Units/sec:%s uSecs/Unit:%s\n%!"
         (sprintf"Measured <<%s>>" comment)
         elapsed units
         ( if units <= 1 then "N/A" else
           sprintf"%8.0f" ((float_of_int units) /. elapsed ) )
         ( if units <= 1 then "N/A" else
           sprintf"%2.8f" (elapsed /. (float_of_int units) *. 1000000.0 ) );
   res


open Printf
module S = Sequence
module V = Vector

let code_to_string_test last =                              (* 
code_to_string *)
   for i = 0 to last do ignore (code_to_string i)  done ;;

let code_into_string_test last =                          (* 
code_into_string *)
   let str = Bytes.create 6 in
   for i = 0 to last do ignore (code_into_string str i)  done ;;

let round_about2 last =                            (* code_to_string and 
back *)
   for i = 0 to last do
     let str = code_to_string i in (* convert code point to char string *)
     let j = string_to_code str in  (* convert the char string back to 
code *)
     assert( i = j)
   done

let round_about3 last =                          (* code_into_string and 
back *)
   let len = ref 1 in
   let str = Bytes.create 6 in
   for i = 0 to last do
     code_into_string str i; (* convert code point to char string *)
     let j = string_to_code str in  (* convert the char string back to 
code *)
     assert( i = j);
     if code_len i <> !len then (printf"New len at %x:%s\n" i str;
                                len := code_len i )
   done


let make_big_string last = (* create a utf8 string of consecutive code 
points *)
   let buf = Buffer.create (16 * 1024 * 1024 ) in (* ocaml tolerates 
large *)
   let str = Bytes.create 6 in
   for i = 0 to last do
     code_into_string str i;
     Buffer.add_string buf str;
   done;
   Buffer.contents buf
   ;;

let make_big_string last = (* create a utf8 string of consecutive code 
points *)
   let buf = Buffer.create (16 * 1024 * 1024 ) in (* ocaml tolerates 
large *)
   let str = Bytes.create 7 in
   for i = 0 to last do
     code_into_string str i;
     Buffer.add_string buf str;
   done;
   let b = Buffer.contents buf in
   printf"\n\ncreated %i strings. \nfinal length: %i bytes. \
          \nAvg len %f\n%!"
          last
          (String.length b)
          ((float_of_int (String.length b)) /. (float_of_int last));
   b

let decode_big big =              (* create a utf8 string; convert to 
int S.t *)
   let check = ref 0 in             (* map char S.t to int S.t and test 
result *)
   (String.to_seq big)
   (mapUtf8Char2Code (fun j -> assert (j = !check) ; incr check))


let last1 = 0x3ffffff;; (*    67_108_863 *)
let last2 = 0x7fffffff;; (* 1073_741_823 *)
let last = last2 (* last2 uses ALL code points and takes a while *)

(* Sample: Convert a utf8 char string to a Vector *)
let utf8_to_vector str = str |> gen_of_utf8 |> V.of_gen

let _ = printf "\"SKサイトリf\" |> utf8_to_vector \n\t |> Vector.to_seq |> 
S.map code_to_string |> fun seq -> seq (printf\"%s\t\")\n!";;

"SKサイトリf" |> utf8_to_vector |> Vector.to_seq |> S.map code_to_string |>
  fun seq -> seq (printf"%s\t");;

(* Test below does not work for last 2 because it allocates a to big 
string *)
let big = measure make_big_string last1 last1 "make_big_string";;
measure decode_big big last1 "decode_big";;

(* Here last2 encodes/decodes ALL possible code points *)
measure round_about2 last last "round_about2";;
measure round_about3 last last "round_about3";;
measure code_into_string_test last last "code_into_string";;
measure code_to_string_test last last "code_to_string";;

let _ = (* One can never have to many samples ... *)
  printf"\n\n    "; S.( 0 -- 15) (fun i -> printf" %2x" i);
  S.( 0 -- 15)
   (fun i -> printf"\n%4x  " (i * 16 + 0x2500);
     S.( (0x2500 + i * 16) -- (0x250F + i * 16))
     |> S.map code_to_string
     |> S.to_list |> String.concat "  " |> print_string
   );;
*)




More information about the Containers-users mailing list