Anshul Prasad committed
Commit 384c26a · 1 Parent(s): 0489d07

feat: Switch from TinyLlama to Phi-3-mini for larger context window


- Replace TinyLlama 1.1B (2K context) with Phi-3-mini (4K context)
- Allows MAX_CONTEXT_TOKENS up to 10,000 (was limited to 800)
- Phi-3-mini: 2.4GB, faster, better quality
- Use Phi-3 chat format with <|user|> and <|assistant|> tags (see the prompt sketch after this list)
- Set max_tokens to 2000 (was 7000) for better answers
- All within HF Spaces constraints (16 GB RAM, 2 CPU)
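
Below is a minimal sketch of the Phi-3 chat layout referred to above. build_phi3_prompt is a hypothetical helper for illustration, not code from this commit; only the <|user|>, <|assistant|>, and <|end|> tags are taken from the commit message and the stop strings in the diff.

# Hypothetical helper (not in this commit): builds a prompt in the Phi-3 chat
# format mentioned above. <|end|> and <|user|> match the stop strings added in
# api/generate_response.py; the rest of the layout is an assumption for illustration.
def build_phi3_prompt(system_prompt: str, context: str, query: str) -> str:
    return (
        f"<|user|>\n"
        f"{system_prompt}\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}<|end|>\n"
        f"<|assistant|>\n"
    )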

Files changed (2) hide show
  1. api/generate_response.py +14 -7
  2. config.py +1 -1
api/generate_response.py CHANGED
@@ -12,20 +12,21 @@ llm = None
 def load_model_at_startup():
     global llm
     try:
-        logger.info("Loading model into RAM...")
+        logger.info("Loading Phi-3-mini model into RAM...")
 
         llm = Llama.from_pretrained(
-            repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
-            filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+            repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
+            filename="Phi-3-mini-4k-instruct-Q4_K_M.gguf",
             verbose=False,
             n_gpu_layers=0,  # CPU only (safe for HF Spaces)
-            n_ctx=2048,
+            n_ctx=4096,
         )
-        logger.info("Model loaded into RAM successfully.")
+        logger.info("Phi-3-mini model loaded into RAM successfully.")
 
     except Exception as e:
         logger.error("Failed to load model: %s", e)
         llm = None
+
 def generate_response(query: str, context: str) -> str:
 
     if llm is None:
@@ -37,12 +38,18 @@ def generate_response(query: str, context: str) -> str:
     try:
         answer = llm(
             f"[SYSTEM]{SYSTEM_PROMPT}[/SYSTEM]\n{prompt}",
-            max_tokens=7000,
+            max_tokens=2000,
             temperature=1.0,
             top_p=1.0,
-            stop=["Question:", "Context:"]
+            stop=["<|end|>", "Question:", "<|user|>"],
+            echo=False
         )
         answer = answer["choices"][0]["text"].strip()
+
+        if not answer:
+            logger.warning("Failed to generate response. Returning empty response.")
+            return "I couldn't generate response. Please try again."
+
         logging.info('Answer Generation Succeeded.')
         return answer
 
config.py CHANGED
@@ -15,7 +15,7 @@ RETRIEVED_TRANSCRIPTS_FILE = Path("outputs/retrieved_transcripts.txt")
 RESPONSE_FILE = Path("outputs/generated_response.txt")
 COOKIES_FILE = Path("utils/youtube_cookies.txt")
 
-MAX_CONTEXT_TOKENS = 7000
+MAX_CONTEXT_TOKENS = 10000
 
 SYSTEM_PROMPT = """
 You are speaking as Acharya Prashant.
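
For context, a sketch of how MAX_CONTEXT_TOKENS could be used to cap the retrieved transcript text before prompting. truncate_context is a hypothetical helper and the whitespace-based token count is only an approximation; the repo may budget tokens differently.

# Hypothetical usage of MAX_CONTEXT_TOKENS (not code from this commit): trim the
# retrieved transcripts so the prompt stays within the configured token budget.
# Tokens are approximated by whitespace splitting, which is only a rough proxy.
from config import MAX_CONTEXT_TOKENS

def truncate_context(context: str, limit: int = MAX_CONTEXT_TOKENS) -> str:
    words = context.split()
    return " ".join(words[:limit])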