There is a very fast line feed search in our SynCommons.pas, written in proper x86_64 SSE2 assembly and checking 16 bytes per loop iteration:
// Returns the offset, in bytes from Text, of the first #10 (LF) or #13 (CR)
// byte found in [Text, TextEnd), or TextEnd - Text when there is none
// - returns 0 when Text = TextEnd
// - x86_64 targets use the SSE2 assembly below, which tests 16 bytes per
//   iteration; any other target uses the plain pascal loop at the end
// - NOTE(review): the asm version always loads whole 16-byte aligned blocks,
//   so it may read up to 15 bytes before Text and up to 15 bytes after
//   TextEnd - 1; matches outside the range are discarded, but that memory is
//   still read - confirm this is acceptable for every caller
function BufferLineLength(Text, TextEnd: PUTF8Char): PtrInt;
{$ifdef CPUX64}
{$ifdef FPC} nostackframe; assembler; asm {$else} asm .noframe {$endif}
{$ifdef MSWINDOWS} // Win64 ABI to System-V ABI
// on Windows the arguments arrive in rcx (Text) and rdx (TextEnd): move them
// into rdi/rsi so that the body below is shared by both ABIs
// rsi/rdi are saved here and restored before every exit of the routine
push rsi
push rdi
mov rdi, rcx
mov rsi, rdx
{$endif}mov r8, rsi
sub r8, rdi // rdi=Text, rsi=TextEnd, r8=TextLen
// empty input: go straight to @fail, which returns r8 = 0
jz @fail
// keep the low bits of Text: they give its misalignment within 16 bytes
mov ecx, edi
// xmm0 = 16 x #10 and xmm1 = 16 x #13, from the constants stored below
movdqa xmm0, [rip + @for10]
movdqa xmm1, [rip + @for13]
and rdi, -16 // check first aligned 16 bytes
and ecx, 15 // lower 4 bits indicate misalignment
// aligned load of the 16-byte block which contains Text[0]
movdqa xmm2, [rdi]
movdqa xmm3, xmm2
// build a per-byte mask: $ff where the byte is #10 (xmm2) or #13 (xmm3)
pcmpeqb xmm2, xmm0
pcmpeqb xmm3, xmm1
por xmm3, xmm2
// eax = one bit per byte of the block, set when this byte is #10 or #13
pmovmskb eax, xmm3
shr eax, cl // shift out unaligned bytes
test eax, eax
// nothing in the first block: continue with the aligned loop
jz @main
// bit index of the first match, now relative to Text thanks to the shift
bsf eax, eax
// rax = aligned base + misalignment + index = address of the match
add rax, rcx
add rax, rdi
// rax = match - TextEnd (unsigned compare: no borrow means match >= TextEnd)
sub rax, rsi
jae @fail // don't exceed TextEnd
add rax, r8 // rax = TextFound - TextEnd + (TextEnd - Text) = offset
{$ifdef MSWINDOWS}
pop rdi
pop rsi
{$endif}ret
// rdi becomes (next aligned block) - TextEnd, i.e. a negative offset which
// is indexed from rsi = TextEnd and counts up towards zero
@main: add rdi, 16
sub rdi, rsi
// the next block starts at or after TextEnd: nothing more to search
jae @fail
// jump over the constants, which are stored inline in the code
jmp @by16
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
// 16 bytes of #10 (LF)
@for10: dq $0a0a0a0a0a0a0a0a
dq $0a0a0a0a0a0a0a0a
// 16 bytes of #13 (CR)
@for13: dq $0d0d0d0d0d0d0d0d
dq $0d0d0d0d0d0d0d0d
@by16: movdqa xmm2, [rdi + rsi] // check 16 bytes per loop
movdqa xmm3, xmm2
// same #10 / #13 mask computation as for the first block
pcmpeqb xmm2, xmm0
pcmpeqb xmm3, xmm1
por xmm3, xmm2
pmovmskb eax, xmm3
test eax, eax
jnz @found
// advance to the next block: the addition carries once the negative offset
// reaches or crosses zero, i.e. once the block start reaches TextEnd
add rdi, 16
jnc @by16
@fail: mov rax, r8 // returns TextLen if no CR/LF found
{$ifdef MSWINDOWS}
pop rdi
pop rsi
{$endif}ret
// index of the first match within the current block
@found: bsf eax, eax
// rax = match - TextEnd; a carry means the match is at or beyond TextEnd
add rax, rdi
jc @fail
// rax = (match - TextEnd) + TextLen = match - Text
add rax, r8
{$ifdef MSWINDOWS}
pop rdi
pop rsi
{$endif}
// NOTE(review): no explicit ret on this path - presumably the compiler
// emits the final ret at the end of the asm routine; confirm for each target
end;
{$else} {$ifdef FPC}inline;{$endif}
var c: cardinal;
begin
result := 0;
// from now on TextEnd holds the length in bytes, no longer a pointer
dec(PtrInt(TextEnd),PtrInt(Text)); // compute TextLen
// a zero length leaves result = 0
if TextEnd<>nil then
repeat
c := ord(Text[result]);
// a byte above 13 can be neither #10 nor #13: skip it after one comparison
if c>13 then begin
inc(result);
if result>=PtrInt(PtrUInt(TextEnd)) then
break;
continue;
end;
// byte in 0..13: stop on LF or CR, with result = its offset
if (c=10) or (c=13) then
break;
// any other control char: advance, and stop with result = TextLen at the end
inc(result);
if result>=PtrInt(PtrUInt(TextEnd)) then
break;
until false;
end;
{$endif CPUX64}
It will be faster than any UTF-8 decoding for sure.
I already hear some people say: "hey, this is premature optimization! the disk is the bottleneck!".
But in 2020, my 1TB SSD reads at more than 3GB/s - https://www.sabrent.com/rocket
These are real numbers on my laptop. So searching at GB/s speed does make sense.
We use similar techniques at https://www.livemon.com/features/log-management
With optimized compression, and distributed search, we reach TB/s brute force speed.