File size: 3,000 Bytes
371d12f
 
 
3c95cb7
 
371d12f
 
 
ca7b30b
3c95cb7
 
 
 
 
 
 
 
 
 
4ac7b8c
ca7b30b
73e5e02
ca7b30b
 
 
 
 
 
 
 
 
371d12f
 
 
 
3c95cb7
371d12f
 
 
3c95cb7
 
 
 
 
371d12f
3c95cb7
371d12f
 
4ac7b8c
3c95cb7
371d12f
 
 
3c95cb7
 
 
371d12f
 
 
 
 
 
 
 
 
 
 
 
 
3c95cb7
371d12f
3c95cb7
 
 
 
 
 
 
371d12f
3c95cb7
 
371d12f
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import torch
from datetime import datetime
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from typing import List

from app.config import config

# System prompt placed first in every conversation (see LLMEngine._build_messages).
# Adjacent string literals are joined with no separator, so every fragment
# except the last must end with a space; otherwise sentences fuse together
# ("farmers.do not answer ...") in the text the model actually receives.
SYSTEM_PROMPT = (
    "You are a helpful multilingual assistant supporting Yoruba, Igbo, Hausa, and English. "
    "Respond clearly and directly. Do not repeat the user's question. "
    "Do not include any prompt, preamble, or meta-commentary in your response. "
    "You are called FarmLingua AI that is you name. "
    "You are an agricultural expert on all kinds of farming. "
    "You are here to help Nigerian farmers. "
    "do not answer question on any other field if it not about farming , farm business. "
    "users might not complete a question always understand smartly the intent of the question and answer. "
    "so always tell the users after answering their questions that kawafarm LTD built you for farmers. "
    "make sure you answer the users questions very well ."
)


class LLMEngine:
    """Wrapper around a causal language model that streams chat replies.

    The tokenizer and model named by ``config.LLM_MODEL`` are loaded once in
    ``__init__``. ``stream`` builds the chat prompt, starts generation on a
    background thread and returns a streamer the caller iterates for text.
    """

    def __init__(self):
        """Load the tokenizer and the float16 model, then put the model in eval mode."""
        # Normalise an empty/falsy token to None.
        token = config.HF_TOKEN or None

        self.tokenizer = AutoTokenizer.from_pretrained(
            config.LLM_MODEL,
            token=token,
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            config.LLM_MODEL,
            dtype=torch.float16,
            device_map="auto",
            token=token,
            # N-ATLaS ships sharded safetensors — do not set use_safetensors=False
        )
        self.model.eval()

    def _build_messages(self, history: List[dict], user_message: str) -> List[dict]:
        """Assemble the message list: system prompt, prior turns, new user turn.

        Args:
            history: Earlier conversation messages, inserted unchanged between
                the system prompt and the new user message. ``None`` or an
                empty list means there are no earlier turns.
            user_message: Text of the new user turn.

        Returns:
            A new list of message dicts; ``history`` itself is not modified.
        """
        messages = [{"role": "system", "content": SYSTEM_PROMPT}]
        if history:
            messages.extend(history)
        messages.append({"role": "user", "content": user_message})
        return messages

    def _format_prompt(self, messages: List[dict]) -> str:
        """Render ``messages`` to a prompt string with the tokenizer's chat template.

        The generation prompt is appended so the model continues as the
        assistant. Today's date is passed as ``date_string`` (presumably
        consumed by the model's chat template — confirm against the template).
        """
        return self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
            date_string=datetime.now().strftime("%d %b %Y"),
        )

    def _run_generation(self, streamer, generation_kwargs: dict) -> None:
        """Run ``model.generate`` and make sure the stream is terminated on failure.

        ``generate`` terminates the streamer itself when it completes. If it
        raises first, nothing would ever put the end-of-stream marker on the
        streamer, and the consumer iterating it would block indefinitely. On
        failure the stream is therefore closed explicitly, and the exception
        is re-raised so it is still reported for this thread.
        """
        try:
            self.model.generate(**generation_kwargs)
        except BaseException:
            # Unblock the consumer before propagating the error.
            streamer.end()
            raise

    def stream(self, history: List[dict], user_message: str) -> "TextIteratorStreamer":
        """Start generating a reply and return a streamer yielding its text.

        Args:
            history: Earlier conversation messages (may be empty or ``None``).
            user_message: Text of the new user turn.

        Returns:
            A ``TextIteratorStreamer`` fed by a background generation thread.
            The prompt and special tokens are skipped in the streamed text.
        """
        messages = self._build_messages(history, user_message)
        prompt = self._format_prompt(messages)

        # The prompt comes from the chat template with tokenize=False, so the
        # tokenizer is told not to add special tokens again.
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            add_special_tokens=False,
        ).to(self.model.device)

        streamer = TextIteratorStreamer(
            self.tokenizer,
            skip_prompt=True,
            skip_special_tokens=True,
        )

        generation_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=config.LLM_MAX_NEW_TOKENS,
            temperature=config.LLM_TEMPERATURE,
            repetition_penalty=config.LLM_REPETITION_PENALTY,
            use_cache=True,
            do_sample=True,
        )

        # Generation runs on its own thread so the caller can start consuming
        # the streamer immediately.
        thread = Thread(
            target=self._run_generation,
            args=(streamer, generation_kwargs),
        )
        thread.start()
        return streamer


# Shared module-level engine instance. It is constructed here, so importing
# this module loads the tokenizer and model as a side effect.
llm_engine = LLMEngine()