tinyBigGAMES 37 Posted April 24 Dllama, a simple and easy to use library for doing local LLM inference directly from Delphi (any language with bindings). It can load GGUF formatted LLMs into CPU or GPU memory. Uses Vulkan back end for acceleration. Simple Example uses System.SysUtils, Dllama, Dllama.Ext; var LResponse: string; LTokenInputSpeed: Single; LTokenOutputSpeed: Single; LInputTokens: Integer; LOutputTokens: Integer; LTotalTokens: Integer; begin // init config Dllama_InitConfig('C:\LLM\gguf', -1, False, VK_ESCAPE); // add model Dllama_AddModel('Meta-Llama-3-8B-Instruct-Q6_K', 'llama3', 1024*8, '<|start_header_id|>%s %s<|end_header_id|>', '\n assistant:\n', ['<|eot_id|>', 'assistant']); // add messages Dllama_AddMessage(ROLE_SYSTEM, 'you are Dllama, a helpful AI assistant.'); Dllama_AddMessage(ROLE_USER, 'who are you?'); // display the user prompt Dllama_Console_PrintLn(Dllama_GetLastUserMessage(), [], DARKGREEN); // do inference if Dllama_Inference('llama3', LResponse) then begin // display usage Dllama_Console_PrintLn(CRLF, [], WHITE); Dllama_GetInferenceUsage(@LTokenInputSpeed, @LTokenOutputSpeed, @LInputTokens, @LOutputTokens, @LTotalTokens); Dllama_Console_PrintLn('Tokens :: Input: %d, Output: %d, Total: %d, Speed: %3.1f t/s', [LInputTokens, LOutputTokens, LTotalTokens, LTokenOutputSpeed], BRIGHTYELLOW); end else begin Dllama_Console_PrintLn('Error: %s', [Dllama_GetError()], RED); end; Dllama_UnloadModel(); end. 2 Share this post Link to post