Anshul Prasad committed
Commit
85673b3
·
1 Parent(s): 26cac24

feat: Replace OpenAI API with local TinyLlama inference


- Use llama-cpp-python for local model loading
- Load model to RAM at startup
- CPU-only inference (n_gpu_layers=0)

Files changed (1)
  1. api/generate_response.py +33 -45
api/generate_response.py CHANGED
@@ -1,63 +1,51 @@
 import logging
-from openai import OpenAI
-import tiktoken
+import os
+from llama_cpp import Llama
 
 from utils.token import count_tokens
-from config import API_URL, MODEL, GH_API_TOKEN, SYSTEM_PROMPT
+from config import SYSTEM_PROMPT
 
 logger = logging.getLogger(__name__)
 
-try:
-    encoder = tiktoken.encoding_for_model(MODEL)
-except KeyError:
-    # fallback for custom or unrecognized model names
-    encoder = tiktoken.get_encoding("cl100k_base")
-
-try:
-    client = OpenAI(base_url=API_URL, api_key=GH_API_TOKEN, timeout=60)
-    logging.info("OpenAI client initialized.")
-except Exception as e:
-    logging.critical("Failed to initialize OpenAI client as %s", e)
-    client = None
+llm = None
+
+
+def load_model_at_startup():
+    global llm
+    try:
+        logger.info("Loading model into RAM...")
+
+        llm = Llama.from_pretrained(
+            repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+            filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+            verbose=False,
+            n_gpu_layers=0,  # CPU only (safe for HF Spaces)
+            n_ctx=2048,
+        )
+        logger.info("Model loaded into RAM successfully.")
+
+    except Exception as e:
+        logger.error("Failed to load model: %s", e)
+        llm = None
 
 
 def generate_response(query: str, context: str) -> str:
 
-    if client is None:
-        return "Error: AI client not configured."
+    if llm is None:
+        return "Error: Model not loaded."
 
     prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
     logging.info("Total number of tokens in prompt: %s", count_tokens(prompt))
 
     try:
-
-        response = client.chat.completions.create(
-            messages=[
-                {
-                    "role": "system",
-                    "content": SYSTEM_PROMPT,
-                },
-                {
-                    "role": "user",
-                    "content": prompt,
-                },
-            ],
-            temperature=1,
-            top_p=1,
-            model=MODEL,
-            stream=False,
+        answer = llm(
+            f"[SYSTEM]{SYSTEM_PROMPT}[/SYSTEM]\n{prompt}",
+            max_tokens=7000,
+            temperature=1.0,
+            top_p=1.0,
+            stop=["Question:", "Context:"]
         )
-
-        # Extract text defensively (depends on SDK return shape)
-        try:
-            response = response.choices[0].message.content
-        except Exception as e:
-            response = getattr(response, "text", None) or str(response)
-            logging.warning("Fallback used for response parsing as %s", e)
-
-        logging.info("Answer generation succeeded.")
-        return response
+        answer = answer["choices"][0]["text"].strip()
+        logging.info("Answer generation succeeded.")
+        return answer
 
     except Exception as e:
-        logging.error("Error during API call as %s", e)
-        return "Sorry, there was an error generating the response."
+        logging.error("Error during inference: %s", e)
+        return "Sorry, there was an error generating the response."