atkiya110 commited on
Commit
4b5424e
·
verified ·
1 Parent(s): b736313

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -52
app.py CHANGED
@@ -45,8 +45,7 @@ API_HEADERS = {"x-api-key": API_KEY}
45
 
46
  GITHUB_BASE = "https://raw.githubusercontent.com/Atkiya/jsonfiles/main/"
47
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
48
- HF_TOKEN = os.getenv("HF_TOKEN", "") # Required for gated Meta models
49
- GEN_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
50
 
51
  CHUNK_SIZE = 400
52
  CHUNK_OVERLAP = 80
@@ -100,7 +99,7 @@ GITHUB_FILES = [
100
 
101
  class AppState:
102
  embedder = None
103
- generator = None # Llama 3.2 pipeline
104
  documents : list = []
105
  faiss_index = None
106
  bm25 = None
@@ -335,98 +334,91 @@ async def async_hybrid_search(query, k=5):
335
  return await asyncio.to_thread(hybrid_search, query, k)
336
 
337
  # ─────────────────────────────────────────────
338
- # GENERATION — Llama 3.2-1B-Instruct (local, no API cost)
339
  # ─────────────────────────────────────────────
340
 
341
- SYSTEM_PROMPT = (
342
- "You are a helpful assistant for East West University (EWU). "
343
- "Answer using ONLY the context provided. "
344
- "If the context does not contain enough information, say so honestly. "
345
- "Be concise and accurate. Do not repeat the context."
346
- )
 
 
 
 
 
 
 
 
 
 
347
 
348
- def _run_llama(query: str, context: str) -> str:
 
349
  """
350
- Synchronous Llama 3.2 call.
351
- Uses the standard chat template (same as Llama 3.1).
352
- Always call via asyncio.to_thread — never directly from async code.
353
  """
354
  if state.generator is None:
355
- return "[Generator not loaded — check HF_TOKEN secret is set in Space settings]"
356
-
357
- # Llama 3.2 uses standard chat template — same as Llama 3.1
358
- # Trim context to ~1500 chars so total prompt fits in the 4096-token window
359
- trimmed_context = context[:1500] + ("…" if len(context) > 1500 else "")
360
 
361
- messages = [
362
- {"role": "system", "content": SYSTEM_PROMPT},
363
- {"role": "user", "content": f"Context:\n{trimmed_context}\n\nQuestion: {query}"},
364
- ]
365
 
366
  try:
367
  outputs = state.generator(
368
- messages,
369
- max_new_tokens=512,
370
- do_sample=True,
371
- temperature=0.3, # low = more factual, less hallucination
372
- top_p=0.9,
373
  repetition_penalty=1.1,
 
374
  )
375
- # transformers pipeline returns list of dicts with generated_text
376
- generated = outputs[0]["generated_text"]
377
 
378
- # generated_text is the full conversation list; grab the last assistant turn
379
- if isinstance(generated, list):
380
- for turn in reversed(generated):
381
- if isinstance(turn, dict) and turn.get("role") == "assistant":
382
- return turn.get("content", "").strip()
383
 
384
- # Fallback: return raw string
385
- return str(generated).strip()
386
 
387
  except Exception as e:
388
- print(f"[ERROR] Llama inference: {e}")
389
  return f"[Generation error: {e}]"
390
 
391
 
392
  async def generate(query: str, context: str) -> str:
393
- """Async wrapper — runs Llama in a thread so the event loop stays free."""
394
- return await asyncio.to_thread(_run_llama, query, context)
395
 
396
  # ─────────────────────────────────────────────
397
  # BOOT
398
  # ─────────────────────────────────────────────
399
 
400
  def _load_generator():
401
- """Load Llama 3.2-1B-Instruct pipeline. Runs in a thread during boot."""
402
  if not HF_OK:
403
  print("[WARN] transformers unavailable — generation disabled.")
404
  return None
405
  try:
406
- if not HF_TOKEN:
407
- print("[WARN] HF_TOKEN not set — cannot load gated Llama model.")
408
- return None
409
- print(f" Loading Llama 3.2-1B-Instruct on {DEVICE}…")
410
- from huggingface_hub import login
411
- login(token=HF_TOKEN, add_to_git_credential=False)
412
  gen = hf_pipeline(
413
  "text-generation",
414
  model=GEN_MODEL,
415
  device=0 if DEVICE == "cuda" else -1,
416
  dtype="auto",
417
- token=HF_TOKEN,
418
  )
419
- print(" Llama 3.2-1B-Instruct ready.")
420
  return gen
421
  except Exception as e:
422
- print(f"[ERROR] Could not load Llama 3.2: {e}")
423
  return None
424
 
425
 
426
  async def _boot():
427
  try:
428
  # 1. Load both models concurrently in threads
429
- print(f"Loading models on {DEVICE}… (Llama 3.2-1B-Instruct + MiniLM)")
430
  state.embedder, state.generator = await asyncio.gather(
431
  asyncio.to_thread(SentenceTransformer, EMBED_MODEL, device=DEVICE) if ST_OK
432
  else asyncio.to_thread(lambda: None),
@@ -514,7 +506,7 @@ async def health():
514
  "device" : DEVICE,
515
  "faiss" : state.faiss_index is not None,
516
  "bm25" : state.bm25 is not None,
517
- "generator" : state.generator is not None,
518
  "error" : state.error or None,
519
  })
520
 
 
45
 
46
  GITHUB_BASE = "https://raw.githubusercontent.com/Atkiya/jsonfiles/main/"
47
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
48
+ GEN_MODEL = "microsoft/phi-2"
 
49
 
50
  CHUNK_SIZE = 400
51
  CHUNK_OVERLAP = 80
 
99
 
100
  class AppState:
101
  embedder = None
102
+ generator = None # Phi-2 pipeline
103
  documents : list = []
104
  faiss_index = None
105
  bm25 = None
 
334
  return await asyncio.to_thread(hybrid_search, query, k)
335
 
336
  # ─────────────────────────────────────────────
337
+ # GENERATION — Phi-2 (local, no API cost, no token needed)
338
  # ─────────────────────────────────────────────
339
 
340
+ def _build_phi2_prompt(query: str, context: str) -> str:
341
+ """
342
+ Phi-2 is a completion model, not a chat model.
343
+ It responds best to an Instruct-style plain-text prompt.
344
+ Trim context to ~1800 chars to stay within Phi-2's 2048-token window.
345
+ """
346
+ trimmed = context[:1800] + ("…" if len(context) > 1800 else "")
347
+ return (
348
+ "You are a helpful assistant for East West University (EWU).\n"
349
+ "Answer using ONLY the information in the context below.\n"
350
+ "If the context does not contain enough information, say so honestly.\n"
351
+ "Be concise and accurate.\n\n"
352
+ f"Context:\n{trimmed}\n\n"
353
+ f"Question: {query}\n"
354
+ "Answer:"
355
+ )
356
 
357
+
358
+ def _run_phi2(query: str, context: str) -> str:
359
  """
360
+ Synchronous Phi-2 call — always call via asyncio.to_thread.
361
+ Phi-2 is a plain completion model; we pass a formatted string, not messages.
 
362
  """
363
  if state.generator is None:
364
+ return "[Generator not loaded]"
 
 
 
 
365
 
366
+ prompt = _build_phi2_prompt(query, context)
 
 
 
367
 
368
  try:
369
  outputs = state.generator(
370
+ prompt,
371
+ max_new_tokens=256,
372
+ do_sample=False, # greedy = more factual for QA
 
 
373
  repetition_penalty=1.1,
374
+ return_full_text=False, # return only the generated part, not the prompt
375
  )
376
+ answer = outputs[0]["generated_text"].strip()
 
377
 
378
+ # Phi-2 sometimes continues past the answer — cut at a second "Question:" if present
379
+ if "\nQuestion:" in answer:
380
+ answer = answer.split("\nQuestion:")[0].strip()
 
 
381
 
382
+ return answer or "[Empty response]"
 
383
 
384
  except Exception as e:
385
+ print(f"[ERROR] Phi-2 inference: {e}")
386
  return f"[Generation error: {e}]"
387
 
388
 
389
  async def generate(query: str, context: str) -> str:
390
+ """Async wrapper — runs Phi-2 in a thread so the event loop stays free."""
391
+ return await asyncio.to_thread(_run_phi2, query, context)
392
 
393
  # ─────────────────────────────────────────────
394
  # BOOT
395
  # ─────────────────────────────────────────────
396
 
397
  def _load_generator():
398
+ """Load Phi-2 pipeline. Runs in a thread during boot. No token required."""
399
  if not HF_OK:
400
  print("[WARN] transformers unavailable — generation disabled.")
401
  return None
402
  try:
403
+ print(f" Loading Phi-2 on {DEVICE}…")
 
 
 
 
 
404
  gen = hf_pipeline(
405
  "text-generation",
406
  model=GEN_MODEL,
407
  device=0 if DEVICE == "cuda" else -1,
408
  dtype="auto",
409
+ trust_remote_code=True, # required for Phi-2
410
  )
411
+ print(" Phi-2 ready.")
412
  return gen
413
  except Exception as e:
414
+ print(f"[ERROR] Could not load Phi-2: {e}")
415
  return None
416
 
417
 
418
  async def _boot():
419
  try:
420
  # 1. Load both models concurrently in threads
421
+ print(f"Loading models on {DEVICE}… (Phi-2 + MiniLM)")
422
  state.embedder, state.generator = await asyncio.gather(
423
  asyncio.to_thread(SentenceTransformer, EMBED_MODEL, device=DEVICE) if ST_OK
424
  else asyncio.to_thread(lambda: None),
 
506
  "device" : DEVICE,
507
  "faiss" : state.faiss_index is not None,
508
  "bm25" : state.bm25 is not None,
509
+ "generator" : "phi-2" if state.generator is not None else None,
510
  "error" : state.error or None,
511
  })
512