What you want is called a "lexer" or a "parser" (which of the two depends on whether or not it is persistent), and it should not actually be hard to get. Below is one that I just wrote myself; feel free to use, modify, delete, eat or drink it, or do whatever else you want with it if it helps you. Be aware that it is untested!
unit Lexer;
interface
uses
System.SysUtils, System.Types, System.Generics.Collections;
type
TLexer = class abstract
public type
ETokenError = class(Exception);
TToken = record
private
FText: String;
FPosition: TPoint;
FKind: Byte;
public
property Text: String read FText;
property Position: TPoint read FPosition;
property Kind: Byte read FKind;
end;
private
FTokens: TList<TToken>;
function GetTokenCount: Integer;
function GetTokens(const AIndex: Integer): TToken;
protected
// Check if end of text is reached
function EndsText(const AChar: PChar): Boolean; virtual;
// Check if end of line is reached
function BreaksLine(const AChar: PChar): Boolean; virtual;
// Check if Char is valid (abort if not)
function IsValidChar(const AChar: Char; const AKind: Byte): Boolean; virtual;
// Get kind of new token
function TokenKind(var AChar: PChar): Byte; virtual; abstract;
// Check if token ends here
function EndsToken(var AChar: PChar; const AKind: Byte): Boolean; virtual; abstract;
// Convert token kind if necessary
procedure ConvertToken(var AChar: PChar; var AKind: Byte); virtual; abstract;
public
property Tokens[const AIndex: Integer]: TToken read GetTokens;
property TokenCount: Integer read GetTokenCount;
constructor Create(const AText: String);
destructor Destroy; override;
end;
implementation
{ TLexer }
function TLexer.BreaksLine(const AChar: PChar): Boolean;
begin
Result := String.Create([AChar[0], AChar[1]]).Equals(sLineBreak);
end;
constructor TLexer.Create(const AText: String);
procedure Parse;
var
Current: PChar;
Previous: PChar;
Token: TToken;
Kind: Byte;
Position: TPoint;
StringBuilder: TStringBuilder;
begin
Position := Default(TPoint);
Current := PChar(AText);
StringBuilder := TStringBuilder.Create;
try
while not EndsText(Current) do
begin
Previous := Current;
Kind := TokenKind(Current);
while not (EndsText(Current) or EndsToken(Current, Kind)) do
begin
if not IsValidChar(Current[0], Kind) then
begin
raise ETokenError.CreateFmt('Invalid character: ', [String.Create([Current[0]]).QuotedString]);
end;
ConvertToken(Current, Kind);
if BreaksLine(Current) then
begin
Inc(Position.Y);
end;
StringBuilder.Append(Current);
Inc(Current);
end;
Token.FText := StringBuilder.ToString;
Token.FKind := Kind;
Inc(Position.X, (Current - Previous) div SizeOf(Char));
Token.FPosition := Position;
StringBuilder.Clear;
FTokens.Add(Token);
end;
finally
StringBuilder.Free;
end;
end;
begin
inherited Create;
FTokens := TList<TToken>.Create;
Parse;
end;
destructor TLexer.Destroy;
begin
FTokens.Free;
inherited;
end;
function TLexer.EndsText(const AChar: PChar): Boolean;
begin
Result := AChar[0] = #0;
end;
function TLexer.GetTokenCount: Integer;
begin
Result := FTokens.Count;
end;
function TLexer.GetTokens(const AIndex: Integer): TToken;
begin
Result := FTokens[AIndex];
end;
function TLexer.IsValidChar(const AChar: Char; const AKind: Byte): Boolean;
begin
Result := CharInSet(AChar, [Low(AnsiChar) .. High(AnsiChar)]);
end;
end.
Note, however, that this implementation is not optimal in terms of performance.