atkiya110 commited on
Commit
4b5424e
·
verified ·
1 Parent(s): b736313

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -52
app.py CHANGED
@@ -45,8 +45,7 @@ API_HEADERS = {"x-api-key": API_KEY}
45
 
46
  GITHUB_BASE = "https://raw.githubusercontent.com/Atkiya/jsonfiles/main/"
47
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
48
- HF_TOKEN = os.getenv("HF_TOKEN", "") # Required for gated Meta models
49
- GEN_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
50
 
51
  CHUNK_SIZE = 400
52
  CHUNK_OVERLAP = 80
@@ -100,7 +99,7 @@ GITHUB_FILES = [
100
 
101
  class AppState:
102
  embedder = None
103
- generator = None # Llama 3.2 pipeline
104
  documents : list = []
105
  faiss_index = None
106
  bm25 = None
@@ -335,98 +334,91 @@ async def async_hybrid_search(query, k=5):
335
  return await asyncio.to_thread(hybrid_search, query, k)
336
 
337
  # ─────────────────────────────────────────────
338
- # GENERATION — Llama 3.2-1B-Instruct (local, no API cost)
339
  # ─────────────────────────────────────────────
340
 
341
- SYSTEM_PROMPT = (
342
- "You are a helpful assistant for East West University (EWU). "
343
- "Answer using ONLY the context provided. "
344
- "If the context does not contain enough information, say so honestly. "
345
- "Be concise and accurate. Do not repeat the context."
346
- )
 
 
 
 
 
 
 
 
 
 
347
 
348
- def _run_llama(query: str, context: str) -> str:
 
349
  """
350
- Synchronous Llama 3.2 call.
351
- Uses the standard chat template (same as Llama 3.1).
352
- Always call via asyncio.to_thread — never directly from async code.
353
  """
354
  if state.generator is None:
355
- return "[Generator not loaded — check HF_TOKEN secret is set in Space settings]"
356
-
357
- # Llama 3.2 uses standard chat template — same as Llama 3.1
358
- # Trim context to ~1500 chars so total prompt fits in the 4096-token window
359
- trimmed_context = context[:1500] + ("…" if len(context) > 1500 else "")
360
 
361
- messages = [
362
- {"role": "system", "content": SYSTEM_PROMPT},
363
- {"role": "user", "content": f"Context:\n{trimmed_context}\n\nQuestion: {query}"},
364
- ]
365
 
366
  try:
367
  outputs = state.generator(
368
- messages,
369
- max_new_tokens=512,
370
- do_sample=True,
371
- temperature=0.3, # low = more factual, less hallucination
372
- top_p=0.9,
373
  repetition_penalty=1.1,
 
374
  )
375
- # transformers pipeline returns list of dicts with generated_text
376
- generated = outputs[0]["generated_text"]
377
 
378
- # generated_text is the full conversation list; grab the last assistant turn
379
- if isinstance(generated, list):
380
- for turn in reversed(generated):
381
- if isinstance(turn, dict) and turn.get("role") == "assistant":
382
- return turn.get("content", "").strip()
383
 
384
- # Fallback: return raw string
385
- return str(generated).strip()
386
 
387
  except Exception as e:
388
- print(f"[ERROR] Llama inference: {e}")
389
  return f"[Generation error: {e}]"
390
 
391
 
392
  async def generate(query: str, context: str) -> str:
393
- """Async wrapper — runs Llama in a thread so the event loop stays free."""
394
- return await asyncio.to_thread(_run_llama, query, context)
395
 
396
  # ─────────────────────────────────────────────
397
  # BOOT
398
  # ─────────────────────────────────────────────
399
 
400
  def _load_generator():
401
- """Load Llama 3.2-1B-Instruct pipeline. Runs in a thread during boot."""
402
  if not HF_OK:
403
  print("[WARN] transformers unavailable — generation disabled.")
404
  return None
405
  try:
406
- if not HF_TOKEN:
407
- print("[WARN] HF_TOKEN not set — cannot load gated Llama model.")
408
- return None
409
- print(f" Loading Llama 3.2-1B-Instruct on {DEVICE}…")
410
- from huggingface_hub import login
411
- login(token=HF_TOKEN, add_to_git_credential=False)
412
  gen = hf_pipeline(
413
  "text-generation",
414
  model=GEN_MODEL,
415
  device=0 if DEVICE == "cuda" else -1,
416
  dtype="auto",
417
- token=HF_TOKEN,
418
  )
419
- print(" Llama 3.2-1B-Instruct ready.")
420
  return gen
421
  except Exception as e:
422
- print(f"[ERROR] Could not load Llama 3.2: {e}")
423
  return None
424
 
425
 
426
  async def _boot():
427
  try:
428
  # 1. Load both models concurrently in threads
429
- print(f"Loading models on {DEVICE}… (Llama 3.2-1B-Instruct + MiniLM)")
430
  state.embedder, state.generator = await asyncio.gather(
431
  asyncio.to_thread(SentenceTransformer, EMBED_MODEL, device=DEVICE) if ST_OK
432
  else asyncio.to_thread(lambda: None),
@@ -514,7 +506,7 @@ async def health():
514
  "device" : DEVICE,
515
  "faiss" : state.faiss_index is not None,
516
  "bm25" : state.bm25 is not None,
517
- "generator" : state.generator is not None,
518
  "error" : state.error or None,
519
  })
520
 
 
45
 
46
  GITHUB_BASE = "https://raw.githubusercontent.com/Atkiya/jsonfiles/main/"
47
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
48
+ GEN_MODEL = "microsoft/phi-2"
 
49
 
50
  CHUNK_SIZE = 400
51
  CHUNK_OVERLAP = 80
 
99
 
100
  class AppState:
101
  embedder = None
102
+ generator = None # Phi-2 pipeline
103
  documents : list = []
104
  faiss_index = None
105
  bm25 = None
 
334
  return await asyncio.to_thread(hybrid_search, query, k)
335
 
336
  # ─────────────────────────────────────────────
337
+ # GENERATION — Phi-2 (local, no API cost, no token needed)
338
  # ─────────────────────────────────────────────
339
 
340
+ def _build_phi2_prompt(query: str, context: str) -> str:
341
+ """
342
+ Phi-2 is a completion model, not a chat model.
343
+ It responds best to an Instruct-style plain-text prompt.
344
+ Trim context to ~1800 chars to stay within Phi-2's 2048-token window.
345
+ """
346
+ trimmed = context[:1800] + ("…" if len(context) > 1800 else "")
347
+ return (
348
+ "You are a helpful assistant for East West University (EWU).\n"
349
+ "Answer using ONLY the information in the context below.\n"
350
+ "If the context does not contain enough information, say so honestly.\n"
351
+ "Be concise and accurate.\n\n"
352
+ f"Context:\n{trimmed}\n\n"
353
+ f"Question: {query}\n"
354
+ "Answer:"
355
+ )
356
 
357
+
358
+ def _run_phi2(query: str, context: str) -> str:
359
  """
360
+ Synchronous Phi-2 call — always call via asyncio.to_thread.
361
+ Phi-2 is a plain completion model; we pass a formatted string, not messages.
 
362
  """
363
  if state.generator is None:
364
+ return "[Generator not loaded]"
 
 
 
 
365
 
366
+ prompt = _build_phi2_prompt(query, context)
 
 
 
367
 
368
  try:
369
  outputs = state.generator(
370
+ prompt,
371
+ max_new_tokens=256,
372
+ do_sample=False, # greedy = more factual for QA
 
 
373
  repetition_penalty=1.1,
374
+ return_full_text=False, # return only the generated part, not the prompt
375
  )
376
+ answer = outputs[0]["generated_text"].strip()
 
377
 
378
+ # Phi-2 sometimes continues past the answer — cut at a second "Question:" if present
379
+ if "\nQuestion:" in answer:
380
+ answer = answer.split("\nQuestion:")[0].strip()
 
 
381
 
382
+ return answer or "[Empty response]"
 
383
 
384
  except Exception as e:
385
+ print(f"[ERROR] Phi-2 inference: {e}")
386
  return f"[Generation error: {e}]"
387
 
388
 
389
  async def generate(query: str, context: str) -> str:
390
+ """Async wrapper — runs Phi-2 in a thread so the event loop stays free."""
391
+ return await asyncio.to_thread(_run_phi2, query, context)
392
 
393
  # ─────────────────────────────────────────────
394
  # BOOT
395
  # ─────────────────────────────────────────────
396
 
397
  def _load_generator():
398
+ """Load Phi-2 pipeline. Runs in a thread during boot. No token required."""
399
  if not HF_OK:
400
  print("[WARN] transformers unavailable — generation disabled.")
401
  return None
402
  try:
403
+ print(f" Loading Phi-2 on {DEVICE}…")
 
 
 
 
 
404
  gen = hf_pipeline(
405
  "text-generation",
406
  model=GEN_MODEL,
407
  device=0 if DEVICE == "cuda" else -1,
408
  dtype="auto",
409
+ trust_remote_code=True, # required for Phi-2
410
  )
411
+ print(" Phi-2 ready.")
412
  return gen
413
  except Exception as e:
414
+ print(f"[ERROR] Could not load Phi-2: {e}")
415
  return None
416
 
417
 
418
  async def _boot():
419
  try:
420
  # 1. Load both models concurrently in threads
421
+ print(f"Loading models on {DEVICE}… (Phi-2 + MiniLM)")
422
  state.embedder, state.generator = await asyncio.gather(
423
  asyncio.to_thread(SentenceTransformer, EMBED_MODEL, device=DEVICE) if ST_OK
424
  else asyncio.to_thread(lambda: None),
 
506
  "device" : DEVICE,
507
  "faiss" : state.faiss_index is not None,
508
  "bm25" : state.bm25 is not None,
509
+ "generator" : "phi-2" if state.generator is not None else None,
510
  "error" : state.error or None,
511
  })
512