ocaml/stdlib/buffer.ml

439 lines
15 KiB
OCaml

(**************************************************************************)
(* *)
(* OCaml *)
(* *)
(* Pierre Weis and Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
(* *)
(* Copyright 1999 Institut National de Recherche en Informatique et *)
(* en Automatique. *)
(* *)
(* All rights reserved. This file is distributed under the terms of *)
(* the GNU Lesser General Public License version 2.1, with the *)
(* special exception on linking described in the file LICENSE. *)
(* *)
(**************************************************************************)
(* Extensible buffers *)
type t =
{mutable buffer : bytes;
mutable position : int;
mutable length : int;
initial_buffer : bytes}
(* Invariants: all parts of the code preserve the invariants that:
- [0 <= b.position <= b.length]
- [b.length = Bytes.length b.buffer]
Note in particular that [b.position = b.length] is legal,
it means that the buffer is full and will have to be extended
before any further addition. *)
let create n =
let n = if n < 1 then 1 else n in
let n = if n > Sys.max_string_length then Sys.max_string_length else n in
let s = Bytes.create n in
{buffer = s; position = 0; length = n; initial_buffer = s}
let contents b = Bytes.sub_string b.buffer 0 b.position
let to_bytes b = Bytes.sub b.buffer 0 b.position
let sub b ofs len =
if ofs < 0 || len < 0 || ofs > b.position - len
then invalid_arg "Buffer.sub"
else Bytes.sub_string b.buffer ofs len
let blit src srcoff dst dstoff len =
if len < 0 || srcoff < 0 || srcoff > src.position - len
|| dstoff < 0 || dstoff > (Bytes.length dst) - len
then invalid_arg "Buffer.blit"
else
Bytes.unsafe_blit src.buffer srcoff dst dstoff len
let nth b ofs =
if ofs < 0 || ofs >= b.position then
invalid_arg "Buffer.nth"
else Bytes.unsafe_get b.buffer ofs
let length b = b.position
let clear b = b.position <- 0
let reset b =
b.position <- 0;
b.buffer <- b.initial_buffer;
b.length <- Bytes.length b.buffer
(* [resize b more] ensures that [b.position + more <= b.length] holds
by dynamically extending [b.buffer] if necessary -- and thus
increasing [b.length].
In particular, after [resize b more] is called, a direct access of
size [more] at [b.position] will always be in-bounds, so that
(unsafe_{get,set}) may be used for performance.
*)
let resize b more =
let old_pos = b.position in
let old_len = b.length in
let new_len = ref old_len in
while old_pos + more > !new_len do new_len := 2 * !new_len done;
if !new_len > Sys.max_string_length then begin
if old_pos + more <= Sys.max_string_length
then new_len := Sys.max_string_length
else failwith "Buffer.add: cannot grow buffer"
end;
let new_buffer = Bytes.create !new_len in
(* PR#6148: let's keep using [blit] rather than [unsafe_blit] in
this tricky function that is slow anyway. *)
Bytes.blit b.buffer 0 new_buffer 0 b.position;
b.buffer <- new_buffer;
b.length <- !new_len;
assert (b.position + more <= b.length);
assert (old_pos + more <= b.length);
()
(* Note: there are various situations (preemptive threads, signals and
gc finalizers) where OCaml code may be run asynchronously; in
particular, there may be a race with another user of [b], changing
its mutable fields in the middle of the [resize] call. The Buffer
module does not provide any correctness guarantee if that happens,
but we must still ensure that the datastructure invariants hold for
memory-safety -- as we plan to use [unsafe_{get,set}].
There are two potential allocation points in this function,
[ref] and [Bytes.create], but all reads and writes to the fields
of [b] happen before both of them or after both of them.
We therefore assume that [b.position] may change at these allocations,
and check that the [b.position + more <= b.length] postcondition
holds for both values of [b.position], before or after the function
is called. More precisely, the following invariants must hold if the
function returns correctly, in addition to the usual buffer invariants:
- [old(b.position) + more <= new(b.length)]
- [new(b.position) + more <= new(b.length)]
- [old(b.length) <= new(b.length)]
Note: [b.position + more <= old(b.length)] does *not*
hold in general, as it is precisely the case where you need
to call [resize] to increase [b.length].
Note: [assert] above does not mean that we know the conditions
always hold, but that the function may return correctly
only if they hold.
Note: the other functions in this module does not need
to be checked with this level of scrutiny, given that they
read/write the buffer immediately after checking that
[b.position + more <= b.length] hold or calling [resize].
*)
let add_char b c =
let pos = b.position in
if pos >= b.length then resize b 1;
Bytes.unsafe_set b.buffer pos c;
b.position <- pos + 1
let add_utf_8_uchar b u = match Uchar.to_int u with
| u when u < 0 -> assert false
| u when u <= 0x007F ->
add_char b (Char.unsafe_chr u)
| u when u <= 0x07FF ->
let pos = b.position in
if pos + 2 > b.length then resize b 2;
Bytes.unsafe_set b.buffer (pos )
(Char.unsafe_chr (0xC0 lor (u lsr 6)));
Bytes.unsafe_set b.buffer (pos + 1)
(Char.unsafe_chr (0x80 lor (u land 0x3F)));
b.position <- pos + 2
| u when u <= 0xFFFF ->
let pos = b.position in
if pos + 3 > b.length then resize b 3;
Bytes.unsafe_set b.buffer (pos )
(Char.unsafe_chr (0xE0 lor (u lsr 12)));
Bytes.unsafe_set b.buffer (pos + 1)
(Char.unsafe_chr (0x80 lor ((u lsr 6) land 0x3F)));
Bytes.unsafe_set b.buffer (pos + 2)
(Char.unsafe_chr (0x80 lor (u land 0x3F)));
b.position <- pos + 3
| u when u <= 0x10FFFF ->
let pos = b.position in
if pos + 4 > b.length then resize b 4;
Bytes.unsafe_set b.buffer (pos )
(Char.unsafe_chr (0xF0 lor (u lsr 18)));
Bytes.unsafe_set b.buffer (pos + 1)
(Char.unsafe_chr (0x80 lor ((u lsr 12) land 0x3F)));
Bytes.unsafe_set b.buffer (pos + 2)
(Char.unsafe_chr (0x80 lor ((u lsr 6) land 0x3F)));
Bytes.unsafe_set b.buffer (pos + 3)
(Char.unsafe_chr (0x80 lor (u land 0x3F)));
b.position <- pos + 4
| _ -> assert false
let add_utf_16be_uchar b u = match Uchar.to_int u with
| u when u < 0 -> assert false
| u when u <= 0xFFFF ->
let pos = b.position in
if pos + 2 > b.length then resize b 2;
Bytes.unsafe_set b.buffer (pos ) (Char.unsafe_chr (u lsr 8));
Bytes.unsafe_set b.buffer (pos + 1) (Char.unsafe_chr (u land 0xFF));
b.position <- pos + 2
| u when u <= 0x10FFFF ->
let u' = u - 0x10000 in
let hi = 0xD800 lor (u' lsr 10) in
let lo = 0xDC00 lor (u' land 0x3FF) in
let pos = b.position in
if pos + 4 > b.length then resize b 4;
Bytes.unsafe_set b.buffer (pos ) (Char.unsafe_chr (hi lsr 8));
Bytes.unsafe_set b.buffer (pos + 1) (Char.unsafe_chr (hi land 0xFF));
Bytes.unsafe_set b.buffer (pos + 2) (Char.unsafe_chr (lo lsr 8));
Bytes.unsafe_set b.buffer (pos + 3) (Char.unsafe_chr (lo land 0xFF));
b.position <- pos + 4
| _ -> assert false
let add_utf_16le_uchar b u = match Uchar.to_int u with
| u when u < 0 -> assert false
| u when u <= 0xFFFF ->
let pos = b.position in
if pos + 2 > b.length then resize b 2;
Bytes.unsafe_set b.buffer (pos ) (Char.unsafe_chr (u land 0xFF));
Bytes.unsafe_set b.buffer (pos + 1) (Char.unsafe_chr (u lsr 8));
b.position <- pos + 2
| u when u <= 0x10FFFF ->
let u' = u - 0x10000 in
let hi = 0xD800 lor (u' lsr 10) in
let lo = 0xDC00 lor (u' land 0x3FF) in
let pos = b.position in
if pos + 4 > b.length then resize b 4;
Bytes.unsafe_set b.buffer (pos ) (Char.unsafe_chr (hi land 0xFF));
Bytes.unsafe_set b.buffer (pos + 1) (Char.unsafe_chr (hi lsr 8));
Bytes.unsafe_set b.buffer (pos + 2) (Char.unsafe_chr (lo land 0xFF));
Bytes.unsafe_set b.buffer (pos + 3) (Char.unsafe_chr (lo lsr 8));
b.position <- pos + 4
| _ -> assert false
let add_substring b s offset len =
if offset < 0 || len < 0 || offset > String.length s - len
then invalid_arg "Buffer.add_substring/add_subbytes";
let new_position = b.position + len in
if new_position > b.length then resize b len;
Bytes.unsafe_blit_string s offset b.buffer b.position len;
b.position <- new_position
let add_subbytes b s offset len =
add_substring b (Bytes.unsafe_to_string s) offset len
let add_string b s =
let len = String.length s in
let new_position = b.position + len in
if new_position > b.length then resize b len;
Bytes.unsafe_blit_string s 0 b.buffer b.position len;
b.position <- new_position
let add_bytes b s = add_string b (Bytes.unsafe_to_string s)
let add_buffer b bs =
add_subbytes b bs.buffer 0 bs.position
(* this (private) function could move into the standard library *)
let really_input_up_to ic buf ofs len =
let rec loop ic buf ~already_read ~ofs ~to_read =
if to_read = 0 then already_read
else begin
let r = input ic buf ofs to_read in
if r = 0 then already_read
else begin
let already_read = already_read + r in
let ofs = ofs + r in
let to_read = to_read - r in
loop ic buf ~already_read ~ofs ~to_read
end
end
in loop ic buf ~already_read:0 ~ofs ~to_read:len
let unsafe_add_channel_up_to b ic len =
if b.position + len > b.length then resize b len;
let n = really_input_up_to ic b.buffer b.position len in
(* The assertion below may fail in weird scenario where
threaded/finalizer code, run asynchronously during the
[really_input_up_to] call, races on the buffer; we don't ensure
correctness in this case, but need to preserve the invariants for
memory-safety (see discussion of [resize]). *)
assert (b.position + n <= b.length);
b.position <- b.position + n;
n
let add_channel b ic len =
if len < 0 || len > Sys.max_string_length then (* PR#5004 *)
invalid_arg "Buffer.add_channel";
let n = unsafe_add_channel_up_to b ic len in
(* It is intentional that a consumer catching End_of_file
will see the data written (see #6719, #7136). *)
if n < len then raise End_of_file;
()
let output_buffer oc b =
output oc b.buffer 0 b.position
let closing = function
| '(' -> ')'
| '{' -> '}'
| _ -> assert false
(* opening and closing: open and close characters, typically ( and )
k: balance of opening and closing chars
s: the string where we are searching
start: the index where we start the search. *)
let advance_to_closing opening closing k s start =
let rec advance k i lim =
if i >= lim then raise Not_found else
if s.[i] = opening then advance (k + 1) (i + 1) lim else
if s.[i] = closing then
if k = 0 then i else advance (k - 1) (i + 1) lim
else advance k (i + 1) lim in
advance k start (String.length s)
let advance_to_non_alpha s start =
let rec advance i lim =
if i >= lim then lim else
match s.[i] with
| 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' -> advance (i + 1) lim
| _ -> i in
advance start (String.length s)
(* We are just at the beginning of an ident in s, starting at start. *)
let find_ident s start lim =
if start >= lim then raise Not_found else
match s.[start] with
(* Parenthesized ident ? *)
| '(' | '{' as c ->
let new_start = start + 1 in
let stop = advance_to_closing c (closing c) 0 s new_start in
String.sub s new_start (stop - start - 1), stop + 1
(* Regular ident *)
| _ ->
let stop = advance_to_non_alpha s (start + 1) in
String.sub s start (stop - start), stop
(* Substitute $ident, $(ident), or ${ident} in s,
according to the function mapping f. *)
let add_substitute b f s =
let lim = String.length s in
let rec subst previous i =
if i < lim then begin
match s.[i] with
| '$' as current when previous = '\\' ->
add_char b current;
subst ' ' (i + 1)
| '$' ->
let j = i + 1 in
let ident, next_i = find_ident s j lim in
add_string b (f ident);
subst ' ' next_i
| current when previous == '\\' ->
add_char b '\\';
add_char b current;
subst ' ' (i + 1)
| '\\' as current ->
subst current (i + 1)
| current ->
add_char b current;
subst current (i + 1)
end else
if previous = '\\' then add_char b previous in
subst ' ' 0
let truncate b len =
if len < 0 || len > length b then
invalid_arg "Buffer.truncate"
else
b.position <- len
(** {1 Iterators} *)
let to_seq b =
let rec aux i () =
(* Note that b.position is not a constant and cannot be lifted out of aux *)
if i >= b.position then Seq.Nil
else
let x = Bytes.unsafe_get b.buffer i in
Seq.Cons (x, aux (i+1))
in
aux 0
let to_seqi b =
let rec aux i () =
(* Note that b.position is not a constant and cannot be lifted out of aux *)
if i >= b.position then Seq.Nil
else
let x = Bytes.unsafe_get b.buffer i in
Seq.Cons ((i,x), aux (i+1))
in
aux 0
let add_seq b seq = Seq.iter (add_char b) seq
let of_seq i =
let b = create 32 in
add_seq b i;
b
(** {6 Binary encoding of integers} *)
external unsafe_set_int8 : bytes -> int -> int -> unit = "%bytes_unsafe_set"
external unsafe_set_int16 : bytes -> int -> int -> unit = "%caml_bytes_set16u"
external unsafe_set_int32 : bytes -> int -> int32 -> unit = "%caml_bytes_set32u"
external unsafe_set_int64 : bytes -> int -> int64 -> unit = "%caml_bytes_set64u"
external swap16 : int -> int = "%bswap16"
external swap32 : int32 -> int32 = "%bswap_int32"
external swap64 : int64 -> int64 = "%bswap_int64"
let add_int8 b x =
let new_position = b.position + 1 in
if new_position > b.length then resize b 1;
unsafe_set_int8 b.buffer b.position x;
b.position <- new_position
let add_int16_ne b x =
let new_position = b.position + 2 in
if new_position > b.length then resize b 2;
unsafe_set_int16 b.buffer b.position x;
b.position <- new_position
let add_int32_ne b x =
let new_position = b.position + 4 in
if new_position > b.length then resize b 4;
unsafe_set_int32 b.buffer b.position x;
b.position <- new_position
let add_int64_ne b x =
let new_position = b.position + 8 in
if new_position > b.length then resize b 8;
unsafe_set_int64 b.buffer b.position x;
b.position <- new_position
let add_int16_le b x =
add_int16_ne b (if Sys.big_endian then swap16 x else x)
let add_int16_be b x =
add_int16_ne b (if Sys.big_endian then x else swap16 x)
let add_int32_le b x =
add_int32_ne b (if Sys.big_endian then swap32 x else x)
let add_int32_be b x =
add_int32_ne b (if Sys.big_endian then x else swap32 x)
let add_int64_le b x =
add_int64_ne b (if Sys.big_endian then swap64 x else x)
let add_int64_be b x =
add_int64_ne b (if Sys.big_endian then x else swap64 x)
let add_uint8 = add_int8
let add_uint16_ne = add_int16_ne
let add_uint16_le = add_int16_le
let add_uint16_be = add_int16_be