<html>

  <head>

    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">

  </head>

  <body text="#000000" bgcolor="#FFFFFF">

    <p>Shouldn't we just standardize on bunzli's libraries (including

      the new <a class="moz-txt-link-freetext" href="https://github.com/dbuenzli/utext">https://github.com/dbuenzli/utext</a>) instead of trying to

      re-write code that usually ends up being quite subtle in each

      standard library ?<br>

    </p>

    <br>

    <div class="moz-cite-prefix">Le 10/02/2018 à 20:53, Simon Cruanes a

      écrit :<br>

    </div>

    <blockquote type="cite" cite="mid:20180210195351.GE1388@carty">

      <pre wrap="">Hi Peter,

Thanks for the suggestions. I'm no expert in unicode, but I do agree

that such basic functionalities should be more easily available.

Maybe a `Ustring` module in containers would make sense (as a private

alias to `string`); most functionalities below would fit there, I think.

Would you consider opening PRs against gen, sequence, and containers

repositories so we can discuss that without spamming the list? I can

help if needed, or do it myself.

I'm interested in tests too, but will probably write some (possibly

using Uutf as a reference); some of the tests you wrote below I can

retrofit in the qtest mechanism.

Cheers!

Le Sat, 10 Feb 2018, peter frey wrote:

</pre>

      <blockquote type="cite">

        <pre wrap="">(*

Reading recent posts on discuss.ocal.org gives me the impression that some

tiny

number of utf related routines should be more easily available.

Container's Sequence.t and Gen.t, in particular could benefit from a couple

of

simple routines.  The code below fits well into that frame work.

I am treating it as public domain code but feel free to make it your own and

to

include it where appropriate. (Perhaps some of the tests could go into the

example directory...)

The routines here DO NOT verify unless its unavoidable.  In particular they

accept ALL code points that can be encoded by the original Utf8 definition.

It is only a matter of language; we could call it utf31 ...

Restricting the range is trivial; as would be including some verification

code.

*)

open Containers

(* Create a generator from a utf8-string. Each call produces a code point.

 * The optional parameter srcIdx specifies then start point in the string.

 * srcIdx must point to a valid suffix of a utf8 string.

 * *)

let gen_of_utf8 ?(srcIdx=ref 0) str =

    let lim = String.length str in

    let assemble_next () =     (* we come here only for multi-byte

characters *)

      let cv jmax accu =      (* utf8 character length; construction of

uchar *)

        let rec cv' j accu' =          (* inner loop j = 1..jmax ; each

uchar *)

          let ch = Char.code str.[ !srcIdx + j] in

          let next = ( (accu' lsl 6) lor ( ch land 0x7f )) in

          if j = jmax then begin     (* except for 1st, each char gives 6

bits*)

            srcIdx := !srcIdx + j +1; Some next            (* +1 for 1st

char *)

          end else cv' (succ j) next

      in cv' 1  (* 1st char is already proccessed! *) accu

    in if !srcIdx >= lim then None else

    let n = str.[ !srcIdx ] in match n with

    (* 0xxxxxxx *) | '\000' .. '\127' -> incr srcIdx; Some (int_of_char n)

    (* 110yyyyy *) | '\128' .. '\223' -> cv 1 ((Char.code n) land 0b11111 )

    (* 1110zzzz *) | '\224' .. '\239' -> cv 2 ((Char.code n) land 0b1111 )

    (* 11110uuu *) | '\240' .. '\247' -> cv 3 ((Char.code n) land 0b111 )

    (* 111110vv *) | '\248' .. '\251' -> cv 4 ((Char.code n) land 0b11 )

    (* 1111110w *) | '\252' .. '\253' -> cv 5 ((Char.code n) land 0b1 )

    (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream")

  in  assemble_next;;

(* The 'natural' stream representation of a utf-string is a generator.

 * But Sequences are not far away ... *)

let makeUtf8Seq ?(srcIdx=ref 0) str = Sequence.of_gen (gen_of_utf8 ~srcIdx

str)

(* Convert a code point to a string; Hopefully some day this will be in the

 * standard library. There are various equally trivial versions of this

around.

 * The returned string is created (allocated) fresh for each k.

 * *)

let code_to_string k =

  let mask = 0b111111 in

  if k < 0 || k >= 0x4000000 then begin

    let s = Bytes.create 6 in

    Bytes.unsafe_set s 0 (Char.chr (0xfc + (k lsr 30)));

    Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 24) land

mask)));

    Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 18) land

mask)));

    Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 12) land

mask)));

    Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));

    Bytes.unsafe_set s 5 (Char.unsafe_chr (0x80 lor (k land mask)));

    s end

  else if k <= 0x7f then

    Bytes.make 1 (Char.unsafe_chr k)

  else if k <= 0x7ff then begin

    let s = Bytes.create 2 in

    Bytes.unsafe_set s 0 (Char.unsafe_chr (0xc0 lor (k lsr 6)));

    Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor (k land mask)));

    s end

  else if k <= 0xffff then begin

    let s = Bytes.create 3 in

    Bytes.unsafe_set s 0 (Char.unsafe_chr (0xe0 lor (k lsr 12)));

    Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));

    Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor (k land mask)));

    s end

  else if k <= 0x1fffff then begin

    let s = Bytes.create 4 in

    Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf0 + (k lsr 18)));

    Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 12) land

mask)));

    Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));

    Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor (k land mask)));

    s end

  else begin

    let s = Bytes.create 5 in

    Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf8 + (k lsr 24)));

    Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 18) land

mask)));

    Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 12) land

mask)));

    Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));

    Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor (k land mask)));

    s end

let string_to_code str =

    let cv jmax accu =      (* utf8 character length; construction of uchar

*)

    if jmax >  String.length str then raise (Failure "string_to_code")

    else let rec cv' j accu' =       (* inner loop j = 1..jmax ; each uchar

*)

      let ch = Char.code (String.unsafe_get str j) in

      let next = ( (accu' lsl 6) lor ( ch land 0x7f )) in

      if j = jmax then next else cv' (succ j) next

      in cv' 1  (* 1st char is already proccessed! *) accu

    in let n = str.[0] in match n with

    (* 0xxxxxxx *) | '\000' .. '\127' -> int_of_char n

    (* 110yyyyy *) | '\128' .. '\223' -> cv 1 ((Char.code n) land 0b11111 )

    (* 1110zzzz *) | '\224' .. '\239' -> cv 2 ((Char.code n) land 0b1111 )

    (* 11110uuu *) | '\240' .. '\247' -> cv 3 ((Char.code n) land 0b111 )

    (* 111110vv *) | '\248' .. '\251' -> cv 4 ((Char.code n) land 0b11 )

    (* 1111110w *) | '\252' .. '\253' -> cv 5 ((Char.code n) land 0b1 )

    (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream")

(* code_into_string over-writes string s which must be 7-byte string.

 * n-byte String ends with '\000' which is set as needed (in case you feed

it

 * to a c-program). The last byte of a string contains the # of unused bytes

in

 * then string. It is set here, for example, by "Bytes.unsafe_set s 6

'\000'"

 * If the string is longer than 1 word (plus header) all hell breaks loose.

 * Use only if the string is copied afterwards. (Buffer.add_string ... etc)

 * DUBIOUS (and about twice as fast because allocation is not needed)

 * js_of_ocaml might be unhappy with this...

*)

let code_into_string s k =

  let mask = 0b111111 in

  if k < 0 || k >= 0x4000000 then begin

    Bytes.unsafe_set s 0 (Char.chr (0xfc + (k lsr 30)));

    Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 24) land

mask)));

    Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 18) land

mask)));

    Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 12) land

mask)));

    Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));

    Bytes.unsafe_set s 5 (Char.unsafe_chr (0x80 lor ( k land mask)));

    Bytes.unsafe_set s 6 '\000';                   (* string internals s/b

OK *)

    Bytes.unsafe_set s 7 (Char.unsafe_chr  1 );   (* string internals

DUBIOUS *)

    () end

  else if k <= 0x7f then begin

    Bytes.unsafe_set s 0 (Char.chr k);

    Bytes.unsafe_set s 1 '\000';

    Bytes.unsafe_set s 7 (Char.unsafe_chr  6 );

    () end

  else if k <= 0x7ff then begin

    Bytes.unsafe_set s 0 (Char.unsafe_chr (0xc0 lor (k lsr 6)));

    Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor (k land mask)));

    Bytes.unsafe_set s 2 '\000';

    Bytes.unsafe_set s 7 (Char.unsafe_chr  5 );

    () end

  else if k <= 0xffff then begin

    Bytes.unsafe_set s 0 (Char.unsafe_chr (0xe0 lor (k lsr 12)));

    Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));

    Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor (k land mask)));

    Bytes.unsafe_set s 3 '\000';

    Bytes.unsafe_set s 7 (Char.unsafe_chr  4 );

    () end

  else if k <= 0x1fffff then begin

    Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf0 + (k lsr 18)));

    Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 12) land

mask)));

    Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));

    Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor (k land mask)));

    Bytes.unsafe_set s 4 '\000';

    Bytes.unsafe_set s 7 (Char.unsafe_chr  3 );

    () end

  else begin

    Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf8 + (k lsr 24)));

    Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 18) land

mask)));

    Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 12) land

mask)));

    Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));

    Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor (k land mask)));

    Bytes.unsafe_set s 5 '\000';

    Bytes.unsafe_set s 7 (Char.unsafe_chr  2 );

    () end

(* Automaton to map a (utf8) char Sequence.t into an int Sequence.t

   Accept up to 6 characters, converting them to an integer to feed to k.

   This code demonstrates that being at the receiving end of a sequence can

   cause hardship ... It needs often a state-machine

   ... and perhaps a better name  *)

  let mapUtf8Char2Code (k: int -> unit) =

    let rec next = ref first

    and accu = ref 0

    and first ch = match ch with

      (* 0xxxxxxx *) | '\000' .. '\127' -> k (int_of_char ch); accu := 0

      (* 110yyyyy *) | '\128' .. '\223' -> cv s1 ch 0b11111

      (* 1110zzzz *) | '\224' .. '\239' -> cv s2 ch 0b1111

      (* 11110uuu *) | '\240' .. '\247' -> cv s3 ch 0b111

      (* 111110vv *) | '\248' .. '\251' -> cv s4 ch 0b11

      (* 1111110w *) | '\252' .. '\253' -> cv s5 ch 0b1

      (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream")

    and inline ch = accu := (!accu lsl 6) lor ((Char.code ch) land 0x7f )

    and s1 ch = inline ch; next := first; k !accu

    and s2 ch = inline ch; next := s1 and s3 ch = inline ch; next := s2

    and s4 ch = inline ch; next := s3 and s5 ch = inline ch; next := s4

    and cv startState initialValue mask =

        next := startState;

        accu := (Char.code initialValue) land mask

  in (fun c -> !next c) ;;

let code_len k =

  if k < 0 || k >= 0x4000000 then 6 else if k <= 0x7f then 1

  else if k <= 0x7ff then 2 else if k <= 0xffff then 3

  else if k <= 0x1fffff then 4 else 5

(*

(*

===========================================================================

 *           various tests of above

*)

(* measure.ml

 * included to avoid dependency on some other (more capable) measuring tool

 *                                *)

let measure fn arg (units:int)  comment =

  (* arg is last argument of fn (or unit not belonging to fn);

     argument units is only used for ratio (units /. elapsed) *)

  let start = Unix.gettimeofday() in

  let res = fn arg in

  let endt = Unix.gettimeofday() in let elapsed = endt -. start in

  let open Printf in

  printf"\n%s\n\tTime:%f Units:%i Units/sec:%s uSecs/Unit:%s\n%!"

        (sprintf"Measured <<%s>>" comment)

        elapsed units

        ( if units <= 1 then "N/A" else

          sprintf"%8.0f" ((float_of_int units) /. elapsed ) )

        ( if units <= 1 then "N/A" else

          sprintf"%2.8f" (elapsed /. (float_of_int units) *. 1000000.0 ) );

  res

open Printf

module S = Sequence

module V = Vector

let code_to_string_test last =                              (*

code_to_string *)

  for i = 0 to last do ignore (code_to_string i)  done ;;

let code_into_string_test last =                          (*

code_into_string *)

  let str = Bytes.create 6 in

  for i = 0 to last do ignore (code_into_string str i)  done ;;

let round_about2 last =                            (* code_to_string and

back *)

  for i = 0 to last do

    let str = code_to_string i in (* convert code point to char string *)

    let j = string_to_code str in  (* convert the char string back to code

*)

    assert( i = j)

  done

let round_about3 last =                          (* code_into_string and

back *)

  let len = ref 1 in

  let str = Bytes.create 6 in

  for i = 0 to last do

    code_into_string str i; (* convert code point to char string *)

    let j = string_to_code str in  (* convert the char string back to code

*)

    assert( i = j);

    if code_len i <> !len then (printf"New len at %x:%s\n" i str;

                               len := code_len i )

  done

let make_big_string last = (* create a utf8 string of consecutive code

points *)

  let buf = Buffer.create (16 * 1024 * 1024 ) in (* ocaml tolerates large *)

  let str = Bytes.create 6 in

  for i = 0 to last do

    code_into_string str i;

    Buffer.add_string buf str;

  done;

  Buffer.contents buf

  ;;

let make_big_string last = (* create a utf8 string of consecutive code

points *)

  let buf = Buffer.create (16 * 1024 * 1024 ) in (* ocaml tolerates large *)

  let str = Bytes.create 7 in

  for i = 0 to last do

    code_into_string str i;

    Buffer.add_string buf str;

  done;

  let b = Buffer.contents buf in

  printf"\n\ncreated %i strings. \nfinal length: %i bytes. \

         \nAvg len %f\n%!"

         last

         (String.length b)

         ((float_of_int (String.length b)) /. (float_of_int last));

  b

let decode_big big =              (* create a utf8 string; convert to int

S.t *)

  let check = ref 0 in             (* map char S.t to int S.t and test

result *)

  (String.to_seq big)

  (mapUtf8Char2Code (fun j -> assert (j = !check) ; incr check))

let last1 = 0x3ffffff;; (*    67_108_863 *)

let last2 = 0x7fffffff;; (* 1073_741_823 *)

let last = last2 (* last2 uses ALL code points and takes a while *)

(* Sample: Convert a utf8 char string to a Vector *)

let utf8_to_vector str = str |> gen_of_utf8 |> V.of_gen

let _ = printf "\"SKサイトリf\" |> utf8_to_vector \n\t |> Vector.to_seq |>

S.map code_to_string |> fun seq -> seq (printf\"%s\t\")\n!";;

"SKサイトリf" |> utf8_to_vector |> Vector.to_seq |> S.map code_to_string |>

 fun seq -> seq (printf"%s\t");;

(* Test below does not work for last 2 because it allocates a to big string

*)

let big = measure make_big_string last1 last1 "make_big_string";;

measure decode_big big last1 "decode_big";;

(* Here last2 encodes/decodes ALL possible code points *)

measure round_about2 last last "round_about2";;

measure round_about3 last last "round_about3";;

measure code_into_string_test last last "code_into_string";;

measure code_to_string_test last last "code_to_string";;

let _ = (* One can never have to many samples ... *)

 printf"\n\n    "; S.( 0 -- 15) (fun i -> printf" %2x" i);

 S.( 0 -- 15)

  (fun i -> printf"\n%4x  " (i * 16 + 0x2500);

    S.( (0x2500 + i * 16) -- (0x250F + i * 16))

    |> S.map code_to_string

    |> S.to_list |> String.concat "  " |> print_string

  );;

*)

</pre>

      </blockquote>

      <pre wrap="">

</pre>

      <br>

      <fieldset class="mimeAttachmentHeader"></fieldset>

      <br>

      <pre wrap="">_______________________________________________

Containers-users mailing list

<a class="moz-txt-link-abbreviated" href="mailto:Containers-users@lists.ocaml.org">Containers-users@lists.ocaml.org</a>

<a class="moz-txt-link-freetext" href="http://lists.ocaml.org/listinfo/containers-users">http://lists.ocaml.org/listinfo/containers-users</a>

</pre>

    </blockquote>

    <br>

  </body>

</html>