telcom committed on
Commit
4a58f2b
·
verified ·
1 Parent(s): 359afce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -62
app.py CHANGED
@@ -3,24 +3,34 @@ import re
3
  import gradio as gr
4
  import numpy as np
5
  import faiss
 
6
 
7
  from pypdf import PdfReader
8
  from docx import Document
9
  from fastembed import TextEmbedding
10
- from huggingface_hub import InferenceClient
11
 
12
 
13
  # -------------------------
14
  # Config
15
  # -------------------------
16
- HF_TOKEN = os.getenv("HF_TOKEN", "")
17
- DEFAULT_CHAT_MODEL = os.getenv("CHAT_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")
18
- DEFAULT_EMBED_MODEL = os.getenv("EMBED_MODEL_ID", "BAAI/bge-small-en-v1.5")
19
 
20
  TOP_K = int(os.getenv("TOP_K", "5"))
21
  CHUNK_CHARS = int(os.getenv("CHUNK_CHARS", "1400"))
22
  CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "250"))
23
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # -------------------------
26
  # Helpers: file -> text
@@ -112,22 +122,64 @@ def retrieve(query: str, embedder: TextEmbedding, index, chunks, top_k: int = TO
112
  return hits
113
 
114
 
 
 
 
 
 
 
 
 
 
 
115
  # -------------------------
116
- # LLM call (HF Inference API)
117
  # -------------------------
118
- def make_client():
119
- if not HF_TOKEN:
120
- return None
121
- return InferenceClient(token=HF_TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
 
 
123
 
124
- def build_prompt(question: str, contexts: list):
125
- ctx_blocks = []
126
- for i, c in enumerate(contexts, start=1):
127
- ctx_blocks.append(f"[Source {i} | score={c['score']:.3f}]\n{c['chunk']}")
128
- ctx_text = "\n\n".join(ctx_blocks).strip()
129
 
130
- rules = (
131
  "You are a resume assistant.\n"
132
  "Answer ONLY using the provided SOURCES.\n"
133
  "If the answer is not explicitly supported by the SOURCES, say: "
@@ -136,36 +188,24 @@ def build_prompt(question: str, contexts: list):
136
  "Keep it concise and professional.\n"
137
  )
138
 
139
- return (
140
- f"{rules}\n"
141
- f"SOURCES:\n{ctx_text}\n\n"
142
  f"QUESTION:\n{question}\n\n"
143
  f"ANSWER:"
144
  )
145
 
146
-
147
- def generate_answer_hf(client: InferenceClient, model_id: str, prompt: str):
148
- resp = client.text_generation(
149
- model=model_id,
150
- prompt=prompt,
151
- max_new_tokens=320,
152
- temperature=0.2,
153
  top_p=0.9,
154
- repetition_penalty=1.05,
155
- do_sample=True,
156
- return_full_text=False,
157
  )
158
- return (resp or "").strip()
159
 
160
-
161
- def format_sources(hits):
162
- lines = []
163
- for i, h in enumerate(hits, start=1):
164
- snippet = re.sub(r"\s+", " ", h["chunk"].strip())
165
- if len(snippet) > 260:
166
- snippet = snippet[:260] + "..."
167
- lines.append(f"- Source {i} (score {h['score']:.3f}): {snippet}")
168
- return "\n".join(lines)
169
 
170
 
171
  # -------------------------
@@ -178,7 +218,6 @@ class AppState:
178
  self.chunks = []
179
  self.ready = False
180
 
181
-
182
  STATE = AppState()
183
 
184
 
@@ -235,7 +274,7 @@ def on_build(file_obj):
235
  return status_badge(False, "Could not chunk the resume. Try DOCX."), gr.update(interactive=False), []
236
 
237
  try:
238
- embedder = TextEmbedding(model_name=DEFAULT_EMBED_MODEL)
239
  vecs = np.array(list(embedder.embed(chunks)), dtype="float32")
240
  index = build_faiss_index(vecs)
241
  except Exception:
@@ -246,10 +285,11 @@ def on_build(file_obj):
246
  STATE.chunks = chunks
247
  STATE.ready = True
248
 
 
249
  return status_badge(True, "Resume loaded. Ask your question below."), gr.update(interactive=True), []
250
 
251
 
252
- def on_ask(question, history, chat_model_id):
253
  history = history or []
254
  q = (question or "").strip()
255
  if not q:
@@ -261,19 +301,11 @@ def on_ask(question, history, chat_model_id):
261
  return history
262
 
263
  hits = retrieve(q, STATE.embedder, STATE.index, STATE.chunks, top_k=TOP_K)
264
- prompt = build_prompt(q, hits)
265
 
266
- client = make_client()
267
- if client is None:
268
- answer = (
269
- "HF_TOKEN is not set, so I cannot call the chat model.\n\n"
270
- "Add a Space secret named HF_TOKEN, then try again."
271
- )
272
- else:
273
- try:
274
- answer = generate_answer_hf(client, chat_model_id, prompt)
275
- except Exception as e:
276
- answer = f"Model call failed: {e}"
277
 
278
  final = f"{answer}\n\nSources:\n{format_sources(hits)}"
279
 
@@ -295,7 +327,7 @@ with gr.Blocks(title="ResumeQA") as demo:
295
  <div style="margin-bottom:10px;">
296
  <div style="font-size:28px;font-weight:900;">ResumeQA</div>
297
  <div style="opacity:0.82;margin-top:2px;">
298
- Upload a resume, then ask questions. Answers stay grounded in the document.
299
  </div>
300
  </div>
301
  """
@@ -306,25 +338,21 @@ with gr.Blocks(title="ResumeQA") as demo:
306
  uploader = gr.File(label="Upload resume (PDF or DOCX)", file_types=[".pdf", ".docx"], height=90)
307
  build_btn = gr.Button("Build resume index", variant="primary")
308
 
309
- # No type/format args, this build uses messages by default
310
  chatbot = gr.Chatbot(label="Chat", height=430)
311
 
312
  with gr.Row():
313
  question = gr.Textbox(
314
  label="Your question",
315
- placeholder="Example: What are my strongest skills for a Solution Architect role?",
316
  interactive=False
317
  )
318
  ask_btn = gr.Button("Ask", variant="primary")
319
 
320
  clear_btn = gr.Button("Clear chat", variant="secondary")
321
- chat_model = gr.Textbox(value=DEFAULT_CHAT_MODEL, visible=False)
322
 
323
  build_btn.click(fn=on_build, inputs=[uploader], outputs=[status_html, question, chatbot])
324
-
325
- ask_btn.click(fn=on_ask, inputs=[question, chatbot, chat_model], outputs=[chatbot]).then(lambda: "", None, question)
326
- question.submit(fn=on_ask, inputs=[question, chatbot, chat_model], outputs=[chatbot]).then(lambda: "", None, question)
327
-
328
  clear_btn.click(fn=on_clear, inputs=None, outputs=[chatbot])
329
 
330
- demo.queue(default_concurrency_limit=8).launch(css=CSS, ssr_mode=False)
 
3
  import gradio as gr
4
  import numpy as np
5
  import faiss
6
+ import requests
7
 
8
  from pypdf import PdfReader
9
  from docx import Document
10
  from fastembed import TextEmbedding
11
+ from llama_cpp import Llama
12
 
13
 
14
  # -------------------------
15
  # Config
16
  # -------------------------
17
+ EMBED_MODEL = os.getenv("EMBED_MODEL_ID", "BAAI/bge-small-en-v1.5")
 
 
18
 
19
  TOP_K = int(os.getenv("TOP_K", "5"))
20
  CHUNK_CHARS = int(os.getenv("CHUNK_CHARS", "1400"))
21
  CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "250"))
22
 
23
+ MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "260"))
24
+ TEMPERATURE = float(os.getenv("TEMPERATURE", "0.2"))
25
+
26
+ # GGUF model path and optional public download URL
27
+ MODEL_PATH = os.getenv("GGUF_MODEL_PATH", "models/model.gguf")
28
+ MODEL_URL = os.getenv("GGUF_MODEL_URL", "") # optional, public direct link to a .gguf
29
+
30
+ # GPU layers: -1 means "as many as possible"
31
+ N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "-1"))
32
+ N_CTX = int(os.getenv("N_CTX", "4096"))
33
+
34
 
35
  # -------------------------
36
  # Helpers: file -> text
 
122
  return hits
123
 
124
 
125
+ def format_sources(hits):
126
+ lines = []
127
+ for i, h in enumerate(hits, start=1):
128
+ snippet = re.sub(r"\s+", " ", h["chunk"].strip())
129
+ if len(snippet) > 220:
130
+ snippet = snippet[:220] + "..."
131
+ lines.append(f"- Source {i} (score {h['score']:.3f}): {snippet}")
132
+ return "\n".join(lines)
133
+
134
+
135
  # -------------------------
136
+ # Local LLM (llama.cpp)
137
  # -------------------------
138
+ _LLM = None
139
+
140
+ def ensure_model_file():
141
+ os.makedirs(os.path.dirname(MODEL_PATH) or ".", exist_ok=True)
142
+ if os.path.exists(MODEL_PATH) and os.path.getsize(MODEL_PATH) > 10_000_000:
143
+ return
144
+
145
+ if not MODEL_URL:
146
+ raise RuntimeError(
147
+ "GGUF model file not found. Set GGUF_MODEL_PATH to an existing .gguf in the repo, "
148
+ "or provide GGUF_MODEL_URL (public direct link to a .gguf)."
149
+ )
150
+
151
+ # Download the model once
152
+ with requests.get(MODEL_URL, stream=True, timeout=120) as r:
153
+ r.raise_for_status()
154
+ with open(MODEL_PATH, "wb") as f:
155
+ for chunk in r.iter_content(chunk_size=1024 * 1024):
156
+ if chunk:
157
+ f.write(chunk)
158
+
159
+ def get_llm():
160
+ global _LLM
161
+ if _LLM is not None:
162
+ return _LLM
163
+
164
+ ensure_model_file()
165
+
166
+ # If CUDA build is present, n_gpu_layers=-1 will push as much as possible to GPU
167
+ _LLM = Llama(
168
+ model_path=MODEL_PATH,
169
+ n_ctx=N_CTX,
170
+ n_threads=max(2, os.cpu_count() or 4),
171
+ n_gpu_layers=N_GPU_LAYERS,
172
+ verbose=False,
173
+ )
174
+ return _LLM
175
+
176
 
177
+ def answer_with_llm(question: str, hits: list):
178
+ llm = get_llm()
179
 
180
+ sources_text = "\n\n".join([f"[Source {i+1}]\n{h['chunk']}" for i, h in enumerate(hits)])
 
 
 
 
181
 
182
+ system = (
183
  "You are a resume assistant.\n"
184
  "Answer ONLY using the provided SOURCES.\n"
185
  "If the answer is not explicitly supported by the SOURCES, say: "
 
188
  "Keep it concise and professional.\n"
189
  )
190
 
191
+ prompt = (
192
+ f"{system}\n\n"
193
+ f"SOURCES:\n{sources_text}\n\n"
194
  f"QUESTION:\n{question}\n\n"
195
  f"ANSWER:"
196
  )
197
 
198
+ out = llm(
199
+ prompt,
200
+ max_tokens=MAX_NEW_TOKENS,
201
+ temperature=TEMPERATURE,
 
 
 
202
  top_p=0.9,
203
+ repeat_penalty=1.05,
204
+ stop=["\n\nQUESTION:", "\n\nSOURCES:"],
 
205
  )
 
206
 
207
+ text = out["choices"][0]["text"].strip()
208
+ return text
 
 
 
 
 
 
 
209
 
210
 
211
  # -------------------------
 
218
  self.chunks = []
219
  self.ready = False
220
 
 
221
  STATE = AppState()
222
 
223
 
 
274
  return status_badge(False, "Could not chunk the resume. Try DOCX."), gr.update(interactive=False), []
275
 
276
  try:
277
+ embedder = TextEmbedding(model_name=EMBED_MODEL)
278
  vecs = np.array(list(embedder.embed(chunks)), dtype="float32")
279
  index = build_faiss_index(vecs)
280
  except Exception:
 
285
  STATE.chunks = chunks
286
  STATE.ready = True
287
 
288
+ # Warm up LLM lazily later, do not block UI
289
  return status_badge(True, "Resume loaded. Ask your question below."), gr.update(interactive=True), []
290
 
291
 
292
+ def on_ask(question, history):
293
  history = history or []
294
  q = (question or "").strip()
295
  if not q:
 
301
  return history
302
 
303
  hits = retrieve(q, STATE.embedder, STATE.index, STATE.chunks, top_k=TOP_K)
 
304
 
305
+ try:
306
+ answer = answer_with_llm(q, hits)
307
+ except Exception as e:
308
+ answer = f"Local model error: {e}"
 
 
 
 
 
 
 
309
 
310
  final = f"{answer}\n\nSources:\n{format_sources(hits)}"
311
 
 
327
  <div style="margin-bottom:10px;">
328
  <div style="font-size:28px;font-weight:900;">ResumeQA</div>
329
  <div style="opacity:0.82;margin-top:2px;">
330
+ Upload a resume, then ask questions. Everything runs locally.
331
  </div>
332
  </div>
333
  """
 
338
  uploader = gr.File(label="Upload resume (PDF or DOCX)", file_types=[".pdf", ".docx"], height=90)
339
  build_btn = gr.Button("Build resume index", variant="primary")
340
 
 
341
  chatbot = gr.Chatbot(label="Chat", height=430)
342
 
343
  with gr.Row():
344
  question = gr.Textbox(
345
  label="Your question",
346
+ placeholder="Example: What roles have I held, and what impact did I deliver?",
347
  interactive=False
348
  )
349
  ask_btn = gr.Button("Ask", variant="primary")
350
 
351
  clear_btn = gr.Button("Clear chat", variant="secondary")
 
352
 
353
  build_btn.click(fn=on_build, inputs=[uploader], outputs=[status_html, question, chatbot])
354
+ ask_btn.click(fn=on_ask, inputs=[question, chatbot], outputs=[chatbot]).then(lambda: "", None, question)
355
+ question.submit(fn=on_ask, inputs=[question, chatbot], outputs=[chatbot]).then(lambda: "", None, question)
 
 
356
  clear_btn.click(fn=on_clear, inputs=None, outputs=[chatbot])
357
 
358
+ demo.queue(default_concurrency_limit=4).launch(css=CSS, ssr_mode=False)