Spaces:

telcom
/

ResumeQA

Sleeping

App Files Files Community

telcom committed on Jan 19

Commit

564dfdd

verified ·

1 Parent(s): 4421834

Create app.py

Browse files

Files changed (1) hide show

app.py +351 -0

app.py ADDED Viewed

	@@ -0,0 +1,351 @@

+import os
+import re
+import json
+import gradio as gr
+import numpy as np
+import faiss
+from pypdf import PdfReader
+from docx import Document
+from sentence_transformers import SentenceTransformer
+from huggingface_hub import InferenceClient
+# -------------------------
+# Config
+# -------------------------
# Model identifiers; each one can be overridden through an environment variable.
DEFAULT_EMBED_MODEL = os.environ.get("EMBED_MODEL_ID", "BAAI/bge-small-en-v1.5")
DEFAULT_CHAT_MODEL = os.environ.get("CHAT_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")
# Hugging Face access token; the empty string means "no token configured".
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# Retrieval settings (number of hits per query, chunk size and chunk overlap in characters)
TOP_K = int(os.environ.get("TOP_K", "5"))
CHUNK_CHARS = int(os.environ.get("CHUNK_CHARS", "1400"))
CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "250"))
# Safety / grounding flag
STRICT_GROUNDED = True
+# -------------------------
+# Helpers: file -> text
+# -------------------------
def _clean_text(s: str) -> str:
    """
    Normalize whitespace in extracted document text.

    NUL characters become spaces, CR/CRLF line endings become "\n", runs of
    spaces/tabs collapse to a single space, trailing spaces on each line are
    dropped, and three or more consecutive newlines collapse to one blank line.
    The result is stripped of leading and trailing whitespace.
    """
    s = s.replace("\x00", " ")
    # Unify line endings first so the blank-line collapse below sees plain "\n".
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = re.sub(r"[ \t]+", " ", s)
    # Lines holding only whitespace would otherwise sit between newlines and
    # stop "\n{3,}" from matching, leaving long vertical gaps in the text.
    s = re.sub(r" +\n", "\n", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()
def extract_text_from_pdf(path: str) -> str:
    """
    Read the PDF at `path` and return its text.

    Pages whose extracted text is empty or whitespace-only are skipped; the
    remaining page texts are joined with a blank line and cleaned.
    """
    page_texts = (page.extract_text() or "" for page in PdfReader(path).pages)
    non_blank = [page_text for page_text in page_texts if page_text.strip()]
    return _clean_text("\n\n".join(non_blank))
def extract_text_from_docx(path: str) -> str:
    """
    Read the DOCX at `path` and return the text of its paragraphs.

    Each paragraph is stripped; empty paragraphs are dropped and the rest are
    joined with single newlines before cleaning.
    """
    stripped = ((para.text or "").strip() for para in Document(path).paragraphs)
    return _clean_text("\n".join(line for line in stripped if line))
def extract_resume_text(file_path: str) -> str:
    """
    Extract text from a resume file, choosing the reader by file extension.

    The extension check is case-insensitive. Raises ValueError for anything
    other than ".pdf" or ".docx".
    """
    name = file_path.lower()
    if name.endswith(".docx"):
        return extract_text_from_docx(file_path)
    elif name.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    else:
        raise ValueError("Unsupported file type. Please upload a PDF or DOCX.")
+# -------------------------
+# Chunking
+# -------------------------
def chunk_text(text: str, chunk_chars: int = CHUNK_CHARS, overlap: int = CHUNK_OVERLAP):
    """
    Simple character-based chunking with overlap.
    Works well enough for resumes and is robust to formatting.

    Args:
        text: Text to split; surrounding whitespace is stripped first.
        chunk_chars: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks.
            Clamped to the range [0, chunk_chars - 1].

    Returns:
        List of non-empty, stripped chunks in document order ([] for blank text).

    Raises:
        ValueError: If `chunk_chars` is not positive and `text` is non-blank.
    """
    text = text.strip()
    if not text:
        return []
    if chunk_chars <= 0:
        raise ValueError("chunk_chars must be a positive integer.")
    # The window start has to move forward on every pass. Without the clamp an
    # overlap >= chunk_chars would pin `start` in place and loop forever, and a
    # negative overlap would silently skip text between chunks.
    overlap = min(max(overlap, 0), chunk_chars - 1)
    step = chunk_chars - overlap
    chunks = []
    n = len(text)
    for start in range(0, n, step):
        end = min(start + chunk_chars, n)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end == n:
            # The window already reaches the end of the text; later windows
            # would only repeat its tail.
            break
    return chunks
+# -------------------------
+# Vector store (FAISS)
+# -------------------------
def normalize(v: np.ndarray) -> np.ndarray:
    """
    Scale each row of `v` to (approximately) unit L2 length.

    A tiny constant is added to every row norm so an all-zero row divides
    cleanly instead of producing NaNs.
    """
    row_lengths = np.linalg.norm(v, axis=1, keepdims=True)
    return v / (row_lengths + 1e-12)
def build_faiss_index(embeddings: np.ndarray):
    """
    Build an inner-product FAISS index over row-normalized embeddings.

    Because the rows are normalized first, inner-product scores act as cosine
    similarity. Returns the index together with the normalized float32 matrix
    that was added to it.
    """
    unit_vectors = normalize(embeddings.astype("float32"))
    index = faiss.IndexFlatIP(unit_vectors.shape[1])
    index.add(unit_vectors)
    return index, unit_vectors
def retrieve(query: str, embedder: SentenceTransformer, index, chunks, top_k: int = TOP_K):
    """
    Embed `query`, search `index`, and return the matching chunks.

    Returns a list of dicts with keys "score", "chunk" and "id", in the order
    the index reports them. Result slots whose id is -1 are skipped.
    """
    query_vec = normalize(embedder.encode([query], convert_to_numpy=True).astype("float32"))
    scores, ids = index.search(query_vec, top_k)
    return [
        {"score": float(hit_score), "chunk": chunks[int(hit_id)], "id": int(hit_id)}
        for hit_score, hit_id in zip(scores[0], ids[0])
        if hit_id != -1
    ]
+# -------------------------
+# LLM call (HF Inference API)
+# -------------------------
def make_client():
    """Return an InferenceClient built from HF_TOKEN, or None when the token is empty."""
    return InferenceClient(token=HF_TOKEN) if HF_TOKEN else None
def build_prompt(question: str, contexts: list):
    """
    Assemble the grounded-answer prompt sent to the chat model.

    `contexts` is a list of dicts with keys: chunk, id, score. Each one is
    rendered as a numbered source block (numbering starts at 1), followed by
    the question and an answer cue.
    """
    rules = (
        "You are a resume assistant. Answer ONLY using the provided SOURCES.\n"
        "If the answer is not explicitly supported by the SOURCES, say: "
        "'I cannot find that in the uploaded resume.'\n"
        "Do not invent roles, dates, skills, employers, or achievements.\n"
        "Be concise and professional.\n"
    )
    sources = "\n\n".join(
        f"[Source {rank} | chunk_id={ctx['id']} | score={ctx['score']:.3f}]\n{ctx['chunk']}"
        for rank, ctx in enumerate(contexts, start=1)
    ).strip()
    sections = [
        rules,
        f"SOURCES:\n{sources}\n",
        f"QUESTION:\n{question}\n",
        "ANSWER (with short bullet points if helpful):",
    ]
    return "\n".join(sections)
def generate_answer_hf(client: InferenceClient, model_id: str, prompt: str):
    """
    Send `prompt` to `model_id` through the client's text-generation call.

    Returns the generated text stripped of surrounding whitespace, or "" when
    the call yields nothing.
    """
    # Low temperature and a short token budget keep answers brief.
    sampling = {
        "max_new_tokens": 350,
        "temperature": 0.2,
        "top_p": 0.9,
        "repetition_penalty": 1.05,
        "do_sample": True,
        "return_full_text": False,
    }
    completion = client.text_generation(model=model_id, prompt=prompt, **sampling)
    if not completion:
        return ""
    return completion.strip()
+# -------------------------
+# App state
+# -------------------------
class AppState:
    """Holds the embedder, FAISS index and chunks for the currently loaded resume."""

    def __init__(self):
        # Nothing is loaded until an upload succeeds.
        self.resume_text = ""
        self.chunks = []
        self.index = None
        self.embedder = None
        self.embed_model_id = DEFAULT_EMBED_MODEL

    def ready(self):
        """Return True once an index exists and at least one chunk is stored."""
        if self.index is None:
            return False
        return len(self.chunks) > 0


# One module-level instance, read and written by the Gradio callbacks.
# NOTE(review): presumably shared by every visitor of the app, so concurrent
# users would see each other's resume -- confirm whether per-session state is needed.
STATE = AppState()
def load_embedder(model_id: str):
    """
    Construct a SentenceTransformer for `model_id`.

    A new model object is created on every call; this function keeps no cache.
    """
    # NOTE(review): any reuse of already-downloaded weights would come from the
    # library itself -- confirm before relying on it.
    embedder = SentenceTransformer(model_id)
    return embedder
+# -------------------------
+# Gradio callbacks
+# -------------------------
def handle_upload(file_obj, embed_model_id):
    """
    Read an uploaded resume, embed its chunks and store the index in STATE.

    Args:
        file_obj: Upload value from the file component: None, a path string,
            or an object whose `.name` attribute is the path.
        embed_model_id: SentenceTransformers model id used for the embeddings.

    Returns:
        A (status, preview, chat_history) tuple. On failure the preview is ""
        and chat_history is None; on success chat_history is [] so the chat
        is reset for the new resume.
    """
    if file_obj is None:
        return "No file uploaded.", "", None
    # Accept both a plain path string and an object exposing the path as
    # `.name`; a bare string has no `.name`, so fall back to the value itself.
    path = getattr(file_obj, "name", file_obj)
    try:
        text = extract_resume_text(path)
    except Exception as e:
        return f"Failed to read file: {e}", "", None
    if not text.strip():
        return "Uploaded file has no extractable text. Try a different PDF (not scanned) or upload DOCX.", "", None
    chunks = chunk_text(text)
    # Load embedder; reuse the one already in memory when the model id is
    # unchanged instead of constructing the model again on every upload.
    try:
        if STATE.embedder is not None and STATE.embed_model_id == embed_model_id:
            embedder = STATE.embedder
        else:
            embedder = load_embedder(embed_model_id)
    except Exception as e:
        return f"Failed to load embedding model '{embed_model_id}': {e}", "", None
    # Embed and index
    try:
        embs = embedder.encode(chunks, convert_to_numpy=True, show_progress_bar=False)
        index, _ = build_faiss_index(embs)
    except Exception as e:
        return f"Failed to embed and index: {e}", "", None
    # Save state only after every step succeeded, so a failed upload leaves the
    # previously loaded resume usable.
    STATE.embedder = embedder
    STATE.index = index
    STATE.chunks = chunks
    STATE.resume_text = text
    STATE.embed_model_id = embed_model_id
    preview = text[:2000] + ("\n\n... (truncated preview)" if len(text) > 2000 else "")
    status = f"Resume loaded. Extracted {len(text)} characters, created {len(chunks)} chunks, FAISS index ready."
    return status, preview, []
def answer_question(message, history, chat_model_id):
    """
    Answer a chat message from the indexed resume and append it to the history.

    Args:
        message: Text typed by the user (may be None or blank).
        history: Current chat history as a list of (user, assistant) pairs, or None.
        chat_model_id: Model id passed to the Hugging Face inference call.

    Returns:
        The chat history as a list, never None. A blank message leaves it
        unchanged; otherwise one (question, answer) pair is appended, where the
        answer ends with a "Sources" section listing the retrieved chunks.
    """
    # Normalize once so every return path hands back a list.
    history = history or []
    if not STATE.ready():
        history.append((message, "Please upload a resume first (PDF or DOCX)."))
        return history
    q = (message or "").strip()
    if not q:
        return history
    # Report retrieval problems in the chat, like model-call failures below,
    # instead of letting the exception escape the callback.
    try:
        hits = retrieve(q, STATE.embedder, STATE.index, STATE.chunks, top_k=TOP_K)
    except Exception as e:
        history.append((q, f"Retrieval failed: {e}"))
        return history
    # Build sources display; long chunks are cut to 550 characters.
    sources_md = []
    for i, h in enumerate(hits, start=1):
        snippet = h["chunk"]
        if len(snippet) > 550:
            snippet = snippet[:550] + "..."
        sources_md.append(f"**Source {i}** (score {h['score']:.3f})\n\n{snippet}")
    prompt = build_prompt(q, hits)
    client = make_client()
    if client is None:
        answer = (
            "HF_TOKEN is not set, so I cannot call a chat model.\n\n"
            "Set a Space secret named HF_TOKEN (your Hugging Face access token), "
            "then ask again."
        )
    else:
        try:
            answer = generate_answer_hf(client, chat_model_id, prompt)
        except Exception as e:
            answer = f"Model call failed: {e}"
    # Append citations section
    full_answer = f"{answer}\n\n---\n### Sources\n" + "\n\n".join(sources_md)
    history.append((q, full_answer))
    return history
+# -------------------------
+# UI
+# -------------------------
# Layout: model settings, upload controls, status/preview, then the chat area.
# Components are created in display order.
with gr.Blocks(title="Resume Q&A (RAG)") as demo:
    intro_md = (
        "# Resume Q&A (Grounded)\n"
        "Upload your resume (PDF or DOCX). Then ask questions. Answers are grounded in retrieved sources.\n\n"
        "Tips:\n"
        "- If your PDF is scanned (image-only), text extraction may fail. Prefer DOCX or a text-based PDF.\n"
        "- Add HF_TOKEN as a Space secret to enable the chat model call.\n"
    )
    gr.Markdown(intro_md)

    with gr.Row():
        embed_model = gr.Textbox(
            label="Embedding model (SentenceTransformers)",
            value=DEFAULT_EMBED_MODEL,
            info="Default is fast and strong for retrieval.",
        )
        chat_model = gr.Textbox(
            label="Chat model (HF Inference model id)",
            value=DEFAULT_CHAT_MODEL,
            info="Used via Hugging Face Inference API. Requires HF_TOKEN secret.",
        )

    with gr.Row():
        uploader = gr.File(label="Upload resume (PDF or DOCX)", file_types=[".pdf", ".docx"])
        upload_btn = gr.Button("Build index")

    status = gr.Textbox(label="Status", interactive=False)
    preview = gr.Textbox(label="Extracted text preview", lines=12, interactive=False)

    gr.Markdown("## Chat")
    chatbot = gr.Chatbot(height=420)
    msg = gr.Textbox(
        label="Ask about the resume",
        placeholder="Example: What companies did I work at and what were my responsibilities?",
    )
    send = gr.Button("Send")
    clear = gr.Button("Clear chat")

    # Building the index also feeds the chatbot output (handle_upload's third value).
    upload_btn.click(
        fn=handle_upload,
        inputs=[uploader, embed_model],
        outputs=[status, preview, chatbot],
    )

    # The Send button and pressing Enter in the textbox run the same handler,
    # then empty the textbox.
    qa_wiring = {
        "fn": answer_question,
        "inputs": [msg, chatbot, chat_model],
        "outputs": [chatbot],
    }
    for register in (send.click, msg.submit):
        register(**qa_wiring).then(lambda: "", None, msg)

    clear.click(lambda: [], None, chatbot)

demo.launch()