[containers-users] Possible additions to Containers and Friends

Sat Feb 24 12:09:43 GMT 2018

Shouldn't we just standardize on bunzli's libraries (including the new 
https://github.com/dbuenzli/utext) instead of trying to re-write code 
that usually ends up being quite subtle in each standard library ?

Le 10/02/2018 à 20:53, Simon Cruanes a écrit :
> Hi Peter,
>
> Thanks for the suggestions. I'm no expert in unicode, but I do agree
> that such basic functionalities should be more easily available.
> Maybe a `Ustring` module in containers would make sense (as a private
> alias to `string`); most functionalities below would fit there, I think.
>
> Would you consider opening PRs against gen, sequence, and containers
> repositories so we can discuss that without spamming the list? I can
> help if needed, or do it myself.
> I'm interested in tests too, but will probably write some (possibly
> using Uutf as a reference); some of the tests you wrote below I can
> retrofit in the qtest mechanism.
>
> Cheers!
>
>
> Le Sat, 10 Feb 2018, peter frey wrote:
>> (*
>> Reading recent posts on discuss.ocal.org gives me the impression that some
>> tiny
>> number of utf related routines should be more easily available.
>> Container's Sequence.t and Gen.t, in particular could benefit from a couple
>> of
>> simple routines.  The code below fits well into that frame work.
>> I am treating it as public domain code but feel free to make it your own and
>> to
>> include it where appropriate. (Perhaps some of the tests could go into the
>> example directory...)
>> The routines here DO NOT verify unless its unavoidable.  In particular they
>> accept ALL code points that can be encoded by the original Utf8 definition.
>> It is only a matter of language; we could call it utf31 ...
>> Restricting the range is trivial; as would be including some verification
>> code.
>>
>>
>>
>> *)
>>
>> open Containers
>>
>> (* Create a generator from a utf8-string. Each call produces a code point.
>>   * The optional parameter srcIdx specifies then start point in the string.
>>   * srcIdx must point to a valid suffix of a utf8 string.
>>   * *)
>> let gen_of_utf8 ?(srcIdx=ref 0) str =
>>      let lim = String.length str in
>>      let assemble_next () =     (* we come here only for multi-byte
>> characters *)
>>        let cv jmax accu =      (* utf8 character length; construction of
>> uchar *)
>>          let rec cv' j accu' =          (* inner loop j = 1..jmax ; each
>> uchar *)
>>            let ch = Char.code str.[ !srcIdx + j] in
>>            let next = ( (accu' lsl 6) lor ( ch land 0x7f )) in
>>            if j = jmax then begin     (* except for 1st, each char gives 6
>> bits*)
>>              srcIdx := !srcIdx + j +1; Some next            (* +1 for 1st
>> char *)
>>            end else cv' (succ j) next
>>        in cv' 1  (* 1st char is already proccessed! *) accu
>>      in if !srcIdx >= lim then None else
>>      let n = str.[ !srcIdx ] in match n with
>>      (* 0xxxxxxx *) | '\000' .. '\127' -> incr srcIdx; Some (int_of_char n)
>>      (* 110yyyyy *) | '\128' .. '\223' -> cv 1 ((Char.code n) land 0b11111 )
>>      (* 1110zzzz *) | '\224' .. '\239' -> cv 2 ((Char.code n) land 0b1111 )
>>      (* 11110uuu *) | '\240' .. '\247' -> cv 3 ((Char.code n) land 0b111 )
>>      (* 111110vv *) | '\248' .. '\251' -> cv 4 ((Char.code n) land 0b11 )
>>      (* 1111110w *) | '\252' .. '\253' -> cv 5 ((Char.code n) land 0b1 )
>>      (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream")
>>    in  assemble_next;;
>>
>>
>> (* The 'natural' stream representation of a utf-string is a generator.
>>   * But Sequences are not far away ... *)
>> let makeUtf8Seq ?(srcIdx=ref 0) str = Sequence.of_gen (gen_of_utf8 ~srcIdx
>> str)
>>
>>
>>
>> (* Convert a code point to a string; Hopefully some day this will be in the
>>   * standard library. There are various equally trivial versions of this
>> around.
>>   * The returned string is created (allocated) fresh for each k.
>>   * *)
>>
>> let code_to_string k =
>>    let mask = 0b111111 in
>>    if k < 0 || k >= 0x4000000 then begin
>>      let s = Bytes.create 6 in
>>      Bytes.unsafe_set s 0 (Char.chr (0xfc + (k lsr 30)));
>>      Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 24) land
>> mask)));
>>      Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 18) land
>> mask)));
>>      Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
>> mask)));
>>      Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>>      Bytes.unsafe_set s 5 (Char.unsafe_chr (0x80 lor (k land mask)));
>>      s end
>>    else if k <= 0x7f then
>>      Bytes.make 1 (Char.unsafe_chr k)
>>    else if k <= 0x7ff then begin
>>      let s = Bytes.create 2 in
>>      Bytes.unsafe_set s 0 (Char.unsafe_chr (0xc0 lor (k lsr 6)));
>>      Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor (k land mask)));
>>      s end
>>    else if k <= 0xffff then begin
>>      let s = Bytes.create 3 in
>>      Bytes.unsafe_set s 0 (Char.unsafe_chr (0xe0 lor (k lsr 12)));
>>      Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>>      Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor (k land mask)));
>>      s end
>>    else if k <= 0x1fffff then begin
>>      let s = Bytes.create 4 in
>>      Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf0 + (k lsr 18)));
>>      Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
>> mask)));
>>      Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>>      Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor (k land mask)));
>>      s end
>>    else begin
>>      let s = Bytes.create 5 in
>>      Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf8 + (k lsr 24)));
>>      Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 18) land
>> mask)));
>>      Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
>> mask)));
>>      Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>>      Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor (k land mask)));
>>      s end
>>
>> let string_to_code str =
>>      let cv jmax accu =      (* utf8 character length; construction of uchar
>> *)
>>      if jmax >  String.length str then raise (Failure "string_to_code")
>>      else let rec cv' j accu' =       (* inner loop j = 1..jmax ; each uchar
>> *)
>>        let ch = Char.code (String.unsafe_get str j) in
>>        let next = ( (accu' lsl 6) lor ( ch land 0x7f )) in
>>        if j = jmax then next else cv' (succ j) next
>>        in cv' 1  (* 1st char is already proccessed! *) accu
>>      in let n = str.[0] in match n with
>>      (* 0xxxxxxx *) | '\000' .. '\127' -> int_of_char n
>>      (* 110yyyyy *) | '\128' .. '\223' -> cv 1 ((Char.code n) land 0b11111 )
>>      (* 1110zzzz *) | '\224' .. '\239' -> cv 2 ((Char.code n) land 0b1111 )
>>      (* 11110uuu *) | '\240' .. '\247' -> cv 3 ((Char.code n) land 0b111 )
>>      (* 111110vv *) | '\248' .. '\251' -> cv 4 ((Char.code n) land 0b11 )
>>      (* 1111110w *) | '\252' .. '\253' -> cv 5 ((Char.code n) land 0b1 )
>>      (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream")
>>
>>
>> (* code_into_string over-writes string s which must be 7-byte string.
>>   * n-byte String ends with '\000' which is set as needed (in case you feed
>> it
>>   * to a c-program). The last byte of a string contains the # of unused bytes
>> in
>>   * then string. It is set here, for example, by "Bytes.unsafe_set s 6
>> '\000'"
>>   * If the string is longer than 1 word (plus header) all hell breaks loose.
>>   * Use only if the string is copied afterwards. (Buffer.add_string ... etc)
>>   * DUBIOUS (and about twice as fast because allocation is not needed)
>>   * js_of_ocaml might be unhappy with this...
>> *)
>> let code_into_string s k =
>>    let mask = 0b111111 in
>>    if k < 0 || k >= 0x4000000 then begin
>>      Bytes.unsafe_set s 0 (Char.chr (0xfc + (k lsr 30)));
>>      Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 24) land
>> mask)));
>>      Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 18) land
>> mask)));
>>      Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
>> mask)));
>>      Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>>      Bytes.unsafe_set s 5 (Char.unsafe_chr (0x80 lor ( k land mask)));
>>      Bytes.unsafe_set s 6 '\000';                   (* string internals s/b
>> OK *)
>>      Bytes.unsafe_set s 7 (Char.unsafe_chr  1 );   (* string internals
>> DUBIOUS *)
>>      () end
>>    else if k <= 0x7f then begin
>>      Bytes.unsafe_set s 0 (Char.chr k);
>>      Bytes.unsafe_set s 1 '\000';
>>      Bytes.unsafe_set s 7 (Char.unsafe_chr  6 );
>>      () end
>>    else if k <= 0x7ff then begin
>>      Bytes.unsafe_set s 0 (Char.unsafe_chr (0xc0 lor (k lsr 6)));
>>      Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor (k land mask)));
>>      Bytes.unsafe_set s 2 '\000';
>>      Bytes.unsafe_set s 7 (Char.unsafe_chr  5 );
>>      () end
>>    else if k <= 0xffff then begin
>>      Bytes.unsafe_set s 0 (Char.unsafe_chr (0xe0 lor (k lsr 12)));
>>      Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>>      Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor (k land mask)));
>>      Bytes.unsafe_set s 3 '\000';
>>      Bytes.unsafe_set s 7 (Char.unsafe_chr  4 );
>>      () end
>>    else if k <= 0x1fffff then begin
>>      Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf0 + (k lsr 18)));
>>      Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
>> mask)));
>>      Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>>      Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor (k land mask)));
>>      Bytes.unsafe_set s 4 '\000';
>>      Bytes.unsafe_set s 7 (Char.unsafe_chr  3 );
>>      () end
>>    else begin
>>      Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf8 + (k lsr 24)));
>>      Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 18) land
>> mask)));
>>      Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
>> mask)));
>>      Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>>      Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor (k land mask)));
>>      Bytes.unsafe_set s 5 '\000';
>>      Bytes.unsafe_set s 7 (Char.unsafe_chr  2 );
>>      () end
>>
>> (* Automaton to map a (utf8) char Sequence.t into an int Sequence.t
>>     Accept up to 6 characters, converting them to an integer to feed to k.
>>     This code demonstrates that being at the receiving end of a sequence can
>>     cause hardship ... It needs often a state-machine
>>     ... and perhaps a better name  *)
>>    let mapUtf8Char2Code (k: int -> unit) =
>>      let rec next = ref first
>>      and accu = ref 0
>>      and first ch = match ch with
>>        (* 0xxxxxxx *) | '\000' .. '\127' -> k (int_of_char ch); accu := 0
>>        (* 110yyyyy *) | '\128' .. '\223' -> cv s1 ch 0b11111
>>        (* 1110zzzz *) | '\224' .. '\239' -> cv s2 ch 0b1111
>>        (* 11110uuu *) | '\240' .. '\247' -> cv s3 ch 0b111
>>        (* 111110vv *) | '\248' .. '\251' -> cv s4 ch 0b11
>>        (* 1111110w *) | '\252' .. '\253' -> cv s5 ch 0b1
>>        (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream")
>>      and inline ch = accu := (!accu lsl 6) lor ((Char.code ch) land 0x7f )
>>      and s1 ch = inline ch; next := first; k !accu
>>      and s2 ch = inline ch; next := s1 and s3 ch = inline ch; next := s2
>>      and s4 ch = inline ch; next := s3 and s5 ch = inline ch; next := s4
>>      and cv startState initialValue mask =
>>          next := startState;
>>          accu := (Char.code initialValue) land mask
>>    in (fun c -> !next c) ;;
>>
>> let code_len k =
>>    if k < 0 || k >= 0x4000000 then 6 else if k <= 0x7f then 1
>>    else if k <= 0x7ff then 2 else if k <= 0xffff then 3
>>    else if k <= 0x1fffff then 4 else 5
>>
>>
>> (*
>> (*
>> ===========================================================================
>>   *           various tests of above
>> *)
>>
>>
>> (* measure.ml
>>   * included to avoid dependency on some other (more capable) measuring tool
>>   *                                *)
>> let measure fn arg (units:int)  comment =
>>    (* arg is last argument of fn (or unit not belonging to fn);
>>       argument units is only used for ratio (units /. elapsed) *)
>>    let start = Unix.gettimeofday() in
>>    let res = fn arg in
>>    let endt = Unix.gettimeofday() in let elapsed = endt -. start in
>>    let open Printf in
>>    printf"\n%s\n\tTime:%f Units:%i Units/sec:%s uSecs/Unit:%s\n%!"
>>          (sprintf"Measured <<%s>>" comment)
>>          elapsed units
>>          ( if units <= 1 then "N/A" else
>>            sprintf"%8.0f" ((float_of_int units) /. elapsed ) )
>>          ( if units <= 1 then "N/A" else
>>            sprintf"%2.8f" (elapsed /. (float_of_int units) *. 1000000.0 ) );
>>    res
>>
>>
>> open Printf
>> module S = Sequence
>> module V = Vector
>>
>> let code_to_string_test last =                              (*
>> code_to_string *)
>>    for i = 0 to last do ignore (code_to_string i)  done ;;
>>
>> let code_into_string_test last =                          (*
>> code_into_string *)
>>    let str = Bytes.create 6 in
>>    for i = 0 to last do ignore (code_into_string str i)  done ;;
>>
>> let round_about2 last =                            (* code_to_string and
>> back *)
>>    for i = 0 to last do
>>      let str = code_to_string i in (* convert code point to char string *)
>>      let j = string_to_code str in  (* convert the char string back to code
>> *)
>>      assert( i = j)
>>    done
>>
>> let round_about3 last =                          (* code_into_string and
>> back *)
>>    let len = ref 1 in
>>    let str = Bytes.create 6 in
>>    for i = 0 to last do
>>      code_into_string str i; (* convert code point to char string *)
>>      let j = string_to_code str in  (* convert the char string back to code
>> *)
>>      assert( i = j);
>>      if code_len i <> !len then (printf"New len at %x:%s\n" i str;
>>                                 len := code_len i )
>>    done
>>
>>
>> let make_big_string last = (* create a utf8 string of consecutive code
>> points *)
>>    let buf = Buffer.create (16 * 1024 * 1024 ) in (* ocaml tolerates large *)
>>    let str = Bytes.create 6 in
>>    for i = 0 to last do
>>      code_into_string str i;
>>      Buffer.add_string buf str;
>>    done;
>>    Buffer.contents buf
>>    ;;
>>
>> let make_big_string last = (* create a utf8 string of consecutive code
>> points *)
>>    let buf = Buffer.create (16 * 1024 * 1024 ) in (* ocaml tolerates large *)
>>    let str = Bytes.create 7 in
>>    for i = 0 to last do
>>      code_into_string str i;
>>      Buffer.add_string buf str;
>>    done;
>>    let b = Buffer.contents buf in
>>    printf"\n\ncreated %i strings. \nfinal length: %i bytes. \
>>           \nAvg len %f\n%!"
>>           last
>>           (String.length b)
>>           ((float_of_int (String.length b)) /. (float_of_int last));
>>    b
>>
>> let decode_big big =              (* create a utf8 string; convert to int
>> S.t *)
>>    let check = ref 0 in             (* map char S.t to int S.t and test
>> result *)
>>    (String.to_seq big)
>>    (mapUtf8Char2Code (fun j -> assert (j = !check) ; incr check))
>>
>>
>> let last1 = 0x3ffffff;; (*    67_108_863 *)
>> let last2 = 0x7fffffff;; (* 1073_741_823 *)
>> let last = last2 (* last2 uses ALL code points and takes a while *)
>>
>> (* Sample: Convert a utf8 char string to a Vector *)
>> let utf8_to_vector str = str |> gen_of_utf8 |> V.of_gen
>>
>> let _ = printf "\"SKサイトリf\" |> utf8_to_vector \n\t |> Vector.to_seq |>
>> S.map code_to_string |> fun seq -> seq (printf\"%s\t\")\n!";;
>>
>> "SKサイトリf" |> utf8_to_vector |> Vector.to_seq |> S.map code_to_string |>
>>   fun seq -> seq (printf"%s\t");;
>>
>> (* Test below does not work for last 2 because it allocates a to big string
>> *)
>> let big = measure make_big_string last1 last1 "make_big_string";;
>> measure decode_big big last1 "decode_big";;
>>
>> (* Here last2 encodes/decodes ALL possible code points *)
>> measure round_about2 last last "round_about2";;
>> measure round_about3 last last "round_about3";;
>> measure code_into_string_test last last "code_into_string";;
>> measure code_to_string_test last last "code_to_string";;
>>
>> let _ = (* One can never have to many samples ... *)
>>   printf"\n\n    "; S.( 0 -- 15) (fun i -> printf" %2x" i);
>>   S.( 0 -- 15)
>>    (fun i -> printf"\n%4x  " (i * 16 + 0x2500);
>>      S.( (0x2500 + i * 16) -- (0x250F + i * 16))
>>      |> S.map code_to_string
>>      |> S.to_list |> String.concat "  " |> print_string
>>    );;
>> *)
>>
>
>
>
> _______________________________________________
> Containers-users mailing list
> Containers-users at lists.ocaml.org
> http://lists.ocaml.org/listinfo/containers-users

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.ocaml.org/pipermail/containers-users/attachments/20180224/217e393a/attachment-0001.html>