[containers-users] Possible additions to Containers and Friends
Drup
drupyog at zoho.com
Sat Feb 24 12:09:43 GMT 2018
Shouldn't we just standardize on bunzli's libraries (including the new
https://github.com/dbuenzli/utext) instead of trying to re-write code
that usually ends up being quite subtle in each standard library ?
Le 10/02/2018 à 20:53, Simon Cruanes a écrit :
> Hi Peter,
>
> Thanks for the suggestions. I'm no expert in unicode, but I do agree
> that such basic functionalities should be more easily available.
> Maybe a `Ustring` module in containers would make sense (as a private
> alias to `string`); most functionalities below would fit there, I think.
>
> Would you consider opening PRs against gen, sequence, and containers
> repositories so we can discuss that without spamming the list? I can
> help if needed, or do it myself.
> I'm interested in tests too, but will probably write some (possibly
> using Uutf as a reference); some of the tests you wrote below I can
> retrofit in the qtest mechanism.
>
> Cheers!
>
>
> Le Sat, 10 Feb 2018, peter frey wrote:
>> (*
>> Reading recent posts on discuss.ocal.org gives me the impression that some
>> tiny
>> number of utf related routines should be more easily available.
>> Container's Sequence.t and Gen.t, in particular could benefit from a couple
>> of
>> simple routines. The code below fits well into that frame work.
>> I am treating it as public domain code but feel free to make it your own and
>> to
>> include it where appropriate. (Perhaps some of the tests could go into the
>> example directory...)
>> The routines here DO NOT verify unless its unavoidable. In particular they
>> accept ALL code points that can be encoded by the original Utf8 definition.
>> It is only a matter of language; we could call it utf31 ...
>> Restricting the range is trivial; as would be including some verification
>> code.
>>
>>
>>
>> *)
>>
>> open Containers
>>
>> (* Create a generator from a utf8-string. Each call produces a code point.
>> * The optional parameter srcIdx specifies then start point in the string.
>> * srcIdx must point to a valid suffix of a utf8 string.
>> * *)
>> let gen_of_utf8 ?(srcIdx=ref 0) str =
>> let lim = String.length str in
>> let assemble_next () = (* we come here only for multi-byte
>> characters *)
>> let cv jmax accu = (* utf8 character length; construction of
>> uchar *)
>> let rec cv' j accu' = (* inner loop j = 1..jmax ; each
>> uchar *)
>> let ch = Char.code str.[ !srcIdx + j] in
>> let next = ( (accu' lsl 6) lor ( ch land 0x7f )) in
>> if j = jmax then begin (* except for 1st, each char gives 6
>> bits*)
>> srcIdx := !srcIdx + j +1; Some next (* +1 for 1st
>> char *)
>> end else cv' (succ j) next
>> in cv' 1 (* 1st char is already proccessed! *) accu
>> in if !srcIdx >= lim then None else
>> let n = str.[ !srcIdx ] in match n with
>> (* 0xxxxxxx *) | '\000' .. '\127' -> incr srcIdx; Some (int_of_char n)
>> (* 110yyyyy *) | '\128' .. '\223' -> cv 1 ((Char.code n) land 0b11111 )
>> (* 1110zzzz *) | '\224' .. '\239' -> cv 2 ((Char.code n) land 0b1111 )
>> (* 11110uuu *) | '\240' .. '\247' -> cv 3 ((Char.code n) land 0b111 )
>> (* 111110vv *) | '\248' .. '\251' -> cv 4 ((Char.code n) land 0b11 )
>> (* 1111110w *) | '\252' .. '\253' -> cv 5 ((Char.code n) land 0b1 )
>> (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream")
>> in assemble_next;;
>>
>>
>> (* The 'natural' stream representation of a utf-string is a generator.
>> * But Sequences are not far away ... *)
>> let makeUtf8Seq ?(srcIdx=ref 0) str = Sequence.of_gen (gen_of_utf8 ~srcIdx
>> str)
>>
>>
>>
>> (* Convert a code point to a string; Hopefully some day this will be in the
>> * standard library. There are various equally trivial versions of this
>> around.
>> * The returned string is created (allocated) fresh for each k.
>> * *)
>>
>> let code_to_string k =
>> let mask = 0b111111 in
>> if k < 0 || k >= 0x4000000 then begin
>> let s = Bytes.create 6 in
>> Bytes.unsafe_set s 0 (Char.chr (0xfc + (k lsr 30)));
>> Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 24) land
>> mask)));
>> Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 18) land
>> mask)));
>> Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
>> mask)));
>> Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>> Bytes.unsafe_set s 5 (Char.unsafe_chr (0x80 lor (k land mask)));
>> s end
>> else if k <= 0x7f then
>> Bytes.make 1 (Char.unsafe_chr k)
>> else if k <= 0x7ff then begin
>> let s = Bytes.create 2 in
>> Bytes.unsafe_set s 0 (Char.unsafe_chr (0xc0 lor (k lsr 6)));
>> Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor (k land mask)));
>> s end
>> else if k <= 0xffff then begin
>> let s = Bytes.create 3 in
>> Bytes.unsafe_set s 0 (Char.unsafe_chr (0xe0 lor (k lsr 12)));
>> Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>> Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor (k land mask)));
>> s end
>> else if k <= 0x1fffff then begin
>> let s = Bytes.create 4 in
>> Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf0 + (k lsr 18)));
>> Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
>> mask)));
>> Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>> Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor (k land mask)));
>> s end
>> else begin
>> let s = Bytes.create 5 in
>> Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf8 + (k lsr 24)));
>> Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 18) land
>> mask)));
>> Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
>> mask)));
>> Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>> Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor (k land mask)));
>> s end
>>
>> let string_to_code str =
>> let cv jmax accu = (* utf8 character length; construction of uchar
>> *)
>> if jmax > String.length str then raise (Failure "string_to_code")
>> else let rec cv' j accu' = (* inner loop j = 1..jmax ; each uchar
>> *)
>> let ch = Char.code (String.unsafe_get str j) in
>> let next = ( (accu' lsl 6) lor ( ch land 0x7f )) in
>> if j = jmax then next else cv' (succ j) next
>> in cv' 1 (* 1st char is already proccessed! *) accu
>> in let n = str.[0] in match n with
>> (* 0xxxxxxx *) | '\000' .. '\127' -> int_of_char n
>> (* 110yyyyy *) | '\128' .. '\223' -> cv 1 ((Char.code n) land 0b11111 )
>> (* 1110zzzz *) | '\224' .. '\239' -> cv 2 ((Char.code n) land 0b1111 )
>> (* 11110uuu *) | '\240' .. '\247' -> cv 3 ((Char.code n) land 0b111 )
>> (* 111110vv *) | '\248' .. '\251' -> cv 4 ((Char.code n) land 0b11 )
>> (* 1111110w *) | '\252' .. '\253' -> cv 5 ((Char.code n) land 0b1 )
>> (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream")
>>
>>
>> (* code_into_string over-writes string s which must be 7-byte string.
>> * n-byte String ends with '\000' which is set as needed (in case you feed
>> it
>> * to a c-program). The last byte of a string contains the # of unused bytes
>> in
>> * then string. It is set here, for example, by "Bytes.unsafe_set s 6
>> '\000'"
>> * If the string is longer than 1 word (plus header) all hell breaks loose.
>> * Use only if the string is copied afterwards. (Buffer.add_string ... etc)
>> * DUBIOUS (and about twice as fast because allocation is not needed)
>> * js_of_ocaml might be unhappy with this...
>> *)
>> let code_into_string s k =
>> let mask = 0b111111 in
>> if k < 0 || k >= 0x4000000 then begin
>> Bytes.unsafe_set s 0 (Char.chr (0xfc + (k lsr 30)));
>> Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 24) land
>> mask)));
>> Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 18) land
>> mask)));
>> Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
>> mask)));
>> Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>> Bytes.unsafe_set s 5 (Char.unsafe_chr (0x80 lor ( k land mask)));
>> Bytes.unsafe_set s 6 '\000'; (* string internals s/b
>> OK *)
>> Bytes.unsafe_set s 7 (Char.unsafe_chr 1 ); (* string internals
>> DUBIOUS *)
>> () end
>> else if k <= 0x7f then begin
>> Bytes.unsafe_set s 0 (Char.chr k);
>> Bytes.unsafe_set s 1 '\000';
>> Bytes.unsafe_set s 7 (Char.unsafe_chr 6 );
>> () end
>> else if k <= 0x7ff then begin
>> Bytes.unsafe_set s 0 (Char.unsafe_chr (0xc0 lor (k lsr 6)));
>> Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor (k land mask)));
>> Bytes.unsafe_set s 2 '\000';
>> Bytes.unsafe_set s 7 (Char.unsafe_chr 5 );
>> () end
>> else if k <= 0xffff then begin
>> Bytes.unsafe_set s 0 (Char.unsafe_chr (0xe0 lor (k lsr 12)));
>> Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>> Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor (k land mask)));
>> Bytes.unsafe_set s 3 '\000';
>> Bytes.unsafe_set s 7 (Char.unsafe_chr 4 );
>> () end
>> else if k <= 0x1fffff then begin
>> Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf0 + (k lsr 18)));
>> Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
>> mask)));
>> Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>> Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor (k land mask)));
>> Bytes.unsafe_set s 4 '\000';
>> Bytes.unsafe_set s 7 (Char.unsafe_chr 3 );
>> () end
>> else begin
>> Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf8 + (k lsr 24)));
>> Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 18) land
>> mask)));
>> Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
>> mask)));
>> Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>> Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor (k land mask)));
>> Bytes.unsafe_set s 5 '\000';
>> Bytes.unsafe_set s 7 (Char.unsafe_chr 2 );
>> () end
>>
>> (* Automaton to map a (utf8) char Sequence.t into an int Sequence.t
>> Accept up to 6 characters, converting them to an integer to feed to k.
>> This code demonstrates that being at the receiving end of a sequence can
>> cause hardship ... It needs often a state-machine
>> ... and perhaps a better name *)
>> let mapUtf8Char2Code (k: int -> unit) =
>> let rec next = ref first
>> and accu = ref 0
>> and first ch = match ch with
>> (* 0xxxxxxx *) | '\000' .. '\127' -> k (int_of_char ch); accu := 0
>> (* 110yyyyy *) | '\128' .. '\223' -> cv s1 ch 0b11111
>> (* 1110zzzz *) | '\224' .. '\239' -> cv s2 ch 0b1111
>> (* 11110uuu *) | '\240' .. '\247' -> cv s3 ch 0b111
>> (* 111110vv *) | '\248' .. '\251' -> cv s4 ch 0b11
>> (* 1111110w *) | '\252' .. '\253' -> cv s5 ch 0b1
>> (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream")
>> and inline ch = accu := (!accu lsl 6) lor ((Char.code ch) land 0x7f )
>> and s1 ch = inline ch; next := first; k !accu
>> and s2 ch = inline ch; next := s1 and s3 ch = inline ch; next := s2
>> and s4 ch = inline ch; next := s3 and s5 ch = inline ch; next := s4
>> and cv startState initialValue mask =
>> next := startState;
>> accu := (Char.code initialValue) land mask
>> in (fun c -> !next c) ;;
>>
>> let code_len k =
>> if k < 0 || k >= 0x4000000 then 6 else if k <= 0x7f then 1
>> else if k <= 0x7ff then 2 else if k <= 0xffff then 3
>> else if k <= 0x1fffff then 4 else 5
>>
>>
>> (*
>> (*
>> ===========================================================================
>> * various tests of above
>> *)
>>
>>
>> (* measure.ml
>> * included to avoid dependency on some other (more capable) measuring tool
>> * *)
>> let measure fn arg (units:int) comment =
>> (* arg is last argument of fn (or unit not belonging to fn);
>> argument units is only used for ratio (units /. elapsed) *)
>> let start = Unix.gettimeofday() in
>> let res = fn arg in
>> let endt = Unix.gettimeofday() in let elapsed = endt -. start in
>> let open Printf in
>> printf"\n%s\n\tTime:%f Units:%i Units/sec:%s uSecs/Unit:%s\n%!"
>> (sprintf"Measured <<%s>>" comment)
>> elapsed units
>> ( if units <= 1 then "N/A" else
>> sprintf"%8.0f" ((float_of_int units) /. elapsed ) )
>> ( if units <= 1 then "N/A" else
>> sprintf"%2.8f" (elapsed /. (float_of_int units) *. 1000000.0 ) );
>> res
>>
>>
>> open Printf
>> module S = Sequence
>> module V = Vector
>>
>> let code_to_string_test last = (*
>> code_to_string *)
>> for i = 0 to last do ignore (code_to_string i) done ;;
>>
>> let code_into_string_test last = (*
>> code_into_string *)
>> let str = Bytes.create 6 in
>> for i = 0 to last do ignore (code_into_string str i) done ;;
>>
>> let round_about2 last = (* code_to_string and
>> back *)
>> for i = 0 to last do
>> let str = code_to_string i in (* convert code point to char string *)
>> let j = string_to_code str in (* convert the char string back to code
>> *)
>> assert( i = j)
>> done
>>
>> let round_about3 last = (* code_into_string and
>> back *)
>> let len = ref 1 in
>> let str = Bytes.create 6 in
>> for i = 0 to last do
>> code_into_string str i; (* convert code point to char string *)
>> let j = string_to_code str in (* convert the char string back to code
>> *)
>> assert( i = j);
>> if code_len i <> !len then (printf"New len at %x:%s\n" i str;
>> len := code_len i )
>> done
>>
>>
>> let make_big_string last = (* create a utf8 string of consecutive code
>> points *)
>> let buf = Buffer.create (16 * 1024 * 1024 ) in (* ocaml tolerates large *)
>> let str = Bytes.create 6 in
>> for i = 0 to last do
>> code_into_string str i;
>> Buffer.add_string buf str;
>> done;
>> Buffer.contents buf
>> ;;
>>
>> let make_big_string last = (* create a utf8 string of consecutive code
>> points *)
>> let buf = Buffer.create (16 * 1024 * 1024 ) in (* ocaml tolerates large *)
>> let str = Bytes.create 7 in
>> for i = 0 to last do
>> code_into_string str i;
>> Buffer.add_string buf str;
>> done;
>> let b = Buffer.contents buf in
>> printf"\n\ncreated %i strings. \nfinal length: %i bytes. \
>> \nAvg len %f\n%!"
>> last
>> (String.length b)
>> ((float_of_int (String.length b)) /. (float_of_int last));
>> b
>>
>> let decode_big big = (* create a utf8 string; convert to int
>> S.t *)
>> let check = ref 0 in (* map char S.t to int S.t and test
>> result *)
>> (String.to_seq big)
>> (mapUtf8Char2Code (fun j -> assert (j = !check) ; incr check))
>>
>>
>> let last1 = 0x3ffffff;; (* 67_108_863 *)
>> let last2 = 0x7fffffff;; (* 1073_741_823 *)
>> let last = last2 (* last2 uses ALL code points and takes a while *)
>>
>> (* Sample: Convert a utf8 char string to a Vector *)
>> let utf8_to_vector str = str |> gen_of_utf8 |> V.of_gen
>>
>> let _ = printf "\"SKサイトリf\" |> utf8_to_vector \n\t |> Vector.to_seq |>
>> S.map code_to_string |> fun seq -> seq (printf\"%s\t\")\n!";;
>>
>> "SKサイトリf" |> utf8_to_vector |> Vector.to_seq |> S.map code_to_string |>
>> fun seq -> seq (printf"%s\t");;
>>
>> (* Test below does not work for last 2 because it allocates a to big string
>> *)
>> let big = measure make_big_string last1 last1 "make_big_string";;
>> measure decode_big big last1 "decode_big";;
>>
>> (* Here last2 encodes/decodes ALL possible code points *)
>> measure round_about2 last last "round_about2";;
>> measure round_about3 last last "round_about3";;
>> measure code_into_string_test last last "code_into_string";;
>> measure code_to_string_test last last "code_to_string";;
>>
>> let _ = (* One can never have to many samples ... *)
>> printf"\n\n "; S.( 0 -- 15) (fun i -> printf" %2x" i);
>> S.( 0 -- 15)
>> (fun i -> printf"\n%4x " (i * 16 + 0x2500);
>> S.( (0x2500 + i * 16) -- (0x250F + i * 16))
>> |> S.map code_to_string
>> |> S.to_list |> String.concat " " |> print_string
>> );;
>> *)
>>
>
>
>
> _______________________________________________
> Containers-users mailing list
> Containers-users at lists.ocaml.org
> http://lists.ocaml.org/listinfo/containers-users
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.ocaml.org/pipermail/containers-users/attachments/20180224/217e393a/attachment-0001.html>
More information about the Containers-users
mailing list