Dllama is a simple, easy-to-use library for doing local LLM inference directly from Delphi (or any language that can consume its bindings). It loads GGUF-formatted LLMs into CPU or GPU memory and uses a Vulkan back end for acceleration.
Simple Example
uses
  System.SysUtils,
  Dllama,
  Dllama.Ext;

var
  LResponse: string;
  LTokenInputSpeed: Single;
  LTokenOutputSpeed: Single;
  LInputTokens: Integer;
  LOutputTokens: Integer;
  LTotalTokens: Integer;

begin
  // init config
  Dllama_InitConfig('C:\LLM\gguf', -1, False, VK_ESCAPE);

  // add model
  Dllama_AddModel('Meta-Llama-3-8B-Instruct-Q6_K', 'llama3', 1024*8,
    '<|start_header_id|>%s %s<|end_header_id|>', '\n assistant:\n',
    ['<|eot_id|>', 'assistant']);

  // add messages
  Dllama_AddMessage(ROLE_SYSTEM, 'you are Dllama, a helpful AI assistant.');
  Dllama_AddMessage(ROLE_USER, 'who are you?');

  // display the user prompt
  Dllama_Console_PrintLn(Dllama_GetLastUserMessage(), [], DARKGREEN);

  // do inference
  if Dllama_Inference('llama3', LResponse) then
  begin
    // display usage
    Dllama_Console_PrintLn(CRLF, [], WHITE);
    Dllama_GetInferenceUsage(@LTokenInputSpeed, @LTokenOutputSpeed, @LInputTokens,
      @LOutputTokens, @LTotalTokens);
    Dllama_Console_PrintLn('Tokens :: Input: %d, Output: %d, Total: %d, Speed: %3.1f t/s',
      [LInputTokens, LOutputTokens, LTotalTokens, LTokenOutputSpeed], BRIGHTYELLOW);
  end
  else
  begin
    Dllama_Console_PrintLn('Error: %s', [Dllama_GetError()], RED);
  end;

  Dllama_UnloadModel();
end.
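The example above is a single question-and-answer turn. A multi-turn exchange can be sketched with the same routines by adding another user message and running inference against the same model alias before calling Dllama_UnloadModel. This is only an illustrative sketch, reusing the calls shown above, and it assumes the message history accumulates between Dllama_Inference calls and the model stays resident until it is explicitly unloaded.

  // follow-up turn (assumption: prior messages are retained by the library)
  Dllama_AddMessage(ROLE_USER, 'what is Object Pascal?');
  Dllama_Console_PrintLn(Dllama_GetLastUserMessage(), [], DARKGREEN);
  if Dllama_Inference('llama3', LResponse) then
    Dllama_Console_PrintLn(CRLF, [], WHITE)
  else
    Dllama_Console_PrintLn('Error: %s', [Dllama_GetError()], RED);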