telcom committed on
Commit
564dfdd
·
verified ·
1 Parent(s): 4421834

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +351 -0
app.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import gradio as gr
5
+ import numpy as np
6
+ import faiss
7
+
8
+ from pypdf import PdfReader
9
+ from docx import Document
10
+ from sentence_transformers import SentenceTransformer
11
+ from huggingface_hub import InferenceClient
12
+
13
+
14
+ # -------------------------
15
+ # Config
16
+ # -------------------------
17
# Model ids and the API token are read from the environment so a deployment
# can override them without editing this file.
DEFAULT_EMBED_MODEL = os.environ.get("EMBED_MODEL_ID", "BAAI/bge-small-en-v1.5")
DEFAULT_CHAT_MODEL = os.environ.get("CHAT_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Retrieval settings: number of chunks returned per query, and the size and
# overlap (both in characters) used when splitting the resume text.
TOP_K = int(os.environ.get("TOP_K", "5"))
CHUNK_CHARS = int(os.environ.get("CHUNK_CHARS", "1400"))
CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "250"))

# Safety / grounding flag.
STRICT_GROUNDED = True
28
+
29
+
30
+ # -------------------------
31
+ # Helpers: file -> text
32
+ # -------------------------
33
+ def _clean_text(s: str) -> str:
34
+ s = s.replace("\x00", " ")
35
+ s = re.sub(r"[ \t]+", " ", s)
36
+ s = re.sub(r"\n{3,}", "\n\n", s)
37
+ return s.strip()
38
+
39
+
40
def extract_text_from_pdf(path: str) -> str:
    """
    Read every page of the PDF at `path` and return the cleaned text.

    Pages whose extracted text is missing or whitespace-only are dropped;
    the remaining pages are separated by a blank line before cleaning.
    """
    page_texts = (page.extract_text() or "" for page in PdfReader(path).pages)
    return _clean_text("\n\n".join(txt for txt in page_texts if txt.strip()))
48
+
49
+
50
def extract_text_from_docx(path: str) -> str:
    """
    Read the paragraphs of the DOCX at `path` and return the cleaned text.

    Each paragraph's text is stripped, empty paragraphs are skipped, and the
    rest are joined one per line. Only `doc.paragraphs` is read.
    """
    stripped = ((para.text or "").strip() for para in Document(path).paragraphs)
    return _clean_text("\n".join(line for line in stripped if line))
58
+
59
+
60
def extract_resume_text(file_path: str) -> str:
    """
    Pick the extractor that matches the file extension (case-insensitive).

    Returns the extracted text for `.pdf` and `.docx` files.
    Raises ValueError for any other extension.
    """
    lowered = file_path.lower()
    if lowered.endswith(".pdf"):
        extractor = extract_text_from_pdf
    elif lowered.endswith(".docx"):
        extractor = extract_text_from_docx
    else:
        raise ValueError("Unsupported file type. Please upload a PDF or DOCX.")
    return extractor(file_path)
67
+
68
+
69
+ # -------------------------
70
+ # Chunking
71
+ # -------------------------
72
def chunk_text(text: str, chunk_chars: int = CHUNK_CHARS, overlap: int = CHUNK_OVERLAP):
    """
    Simple character-based chunking with overlap.
    Works well enough for resumes and is robust to formatting.

    Args:
        text: Text to split; surrounding whitespace is stripped first.
        chunk_chars: Maximum window size in characters. Must be positive.
        overlap: Number of characters shared between consecutive windows.
            Clamped into [0, chunk_chars - 1].

    Returns:
        List of stripped, non-empty chunks in document order; [] for blank text.

    Raises:
        ValueError: If chunk_chars is not positive.
    """
    if chunk_chars <= 0:
        # A non-positive window can never advance through the text.
        raise ValueError("chunk_chars must be a positive integer.")

    text = text.strip()
    if not text:
        return []

    # With overlap >= chunk_chars the next window would start at or before the
    # current one and the loop would never terminate; a negative overlap would
    # silently skip text between windows. Clamping guarantees forward progress
    # of at least one character and full coverage.
    overlap = min(max(overlap, 0), chunk_chars - 1)

    chunks = []
    start = 0
    n = len(text)

    while start < n:
        end = min(start + chunk_chars, n)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end == n:
            break
        start = end - overlap

    return chunks
95
+
96
+
97
+ # -------------------------
98
+ # Vector store (FAISS)
99
+ # -------------------------
100
def normalize(v: np.ndarray) -> np.ndarray:
    """
    Scale each row of `v` to (approximately) unit L2 length.

    A tiny epsilon is added to every row norm so an all-zero row divides
    cleanly instead of producing NaNs.
    """
    row_norms = np.linalg.norm(v, axis=1, keepdims=True)
    return v / (row_norms + 1e-12)
103
+
104
+
105
def build_faiss_index(embeddings: np.ndarray):
    """
    Build an inner-product FAISS index over row-normalized embeddings.

    With normalized vectors, inner product equals cosine similarity.
    Returns (index, normalized float32 embeddings).
    """
    unit_vectors = normalize(embeddings.astype("float32"))
    index = faiss.IndexFlatIP(unit_vectors.shape[1])
    index.add(unit_vectors)
    return index, unit_vectors
114
+
115
+
116
def retrieve(query: str, embedder: SentenceTransformer, index, chunks, top_k: int = TOP_K):
    """
    Embed `query`, search `index`, and return the matching chunks.

    Returns a list of dicts with keys "score" (float), "chunk" (the text from
    `chunks`) and "id" (int position in `chunks`), in the order the index
    returned them. Results whose id is -1 are skipped (presumably the index's
    marker for an unfilled result slot).
    """
    query_vec = normalize(embedder.encode([query], convert_to_numpy=True).astype("float32"))
    scores, ids = index.search(query_vec, top_k)
    return [
        {"score": float(score), "chunk": chunks[int(idx)], "id": int(idx)}
        for score, idx in zip(scores[0], ids[0])
        if idx != -1
    ]
126
+
127
+
128
+ # -------------------------
129
+ # LLM call (HF Inference API)
130
+ # -------------------------
131
def make_client():
    """Return an InferenceClient built from HF_TOKEN, or None when the token is empty."""
    return InferenceClient(token=HF_TOKEN) if HF_TOKEN else None
135
+
136
+
137
def build_prompt(question: str, contexts: list):
    """
    Assemble the grounded prompt sent to the chat model.

    `contexts` is a list of dicts with keys: chunk, id, score. Each becomes a
    numbered "[Source N | chunk_id=... | score=...]" block; the blocks follow
    the grounding rules and precede the question and the answer cue.
    """
    sources = "\n\n".join(
        f"[Source {rank} | chunk_id={hit['id']} | score={hit['score']:.3f}]\n{hit['chunk']}"
        for rank, hit in enumerate(contexts, start=1)
    ).strip()

    instructions = (
        "You are a resume assistant. Answer ONLY using the provided SOURCES.\n"
        "If the answer is not explicitly supported by the SOURCES, say: "
        "'I cannot find that in the uploaded resume.'\n"
        "Do not invent roles, dates, skills, employers, or achievements.\n"
        "Be concise and professional.\n"
    )

    sections = [
        f"{instructions}\n",
        f"SOURCES:\n{sources}\n\n",
        f"QUESTION:\n{question}\n\n",
        "ANSWER (with short bullet points if helpful):",
    ]
    return "".join(sections)
162
+
163
+
164
def generate_answer_hf(client: InferenceClient, model_id: str, prompt: str):
    """
    Send `prompt` to `model_id` through the client's text-generation call.

    Returns the generated text with surrounding whitespace removed; an empty
    or missing response yields "".
    """
    # Conservative sampling settings to keep answers short and on-topic.
    generation_args = {
        "model": model_id,
        "prompt": prompt,
        "max_new_tokens": 350,
        "temperature": 0.2,
        "top_p": 0.9,
        "repetition_penalty": 1.05,
        "do_sample": True,
        "return_full_text": False,
    }
    completion = client.text_generation(**generation_args)
    return (completion or "").strip()
180
+
181
+
182
+ # -------------------------
183
+ # App state
184
+ # -------------------------
185
class AppState:
    """Mutable holder for the currently loaded resume and its search index."""

    def __init__(self):
        # Full extracted resume text and the chunks it was split into.
        self.resume_text = ""
        self.chunks = []
        # Search index over the chunks; None until a resume has been indexed.
        self.index = None
        # Embedding model instance and the model id it was loaded from.
        self.embedder = None
        self.embed_model_id = DEFAULT_EMBED_MODEL

    def ready(self):
        """Return True once an index exists and at least one chunk is stored."""
        if self.index is None:
            return False
        return len(self.chunks) > 0
195
+
196
+
197
STATE = AppState()

# Most recently loaded embedder, keyed by model id, so repeated uploads with
# the same model reuse the in-memory instance instead of reloading it.
_EMBEDDER_CACHE = {}


def load_embedder(model_id: str):
    """
    Return a SentenceTransformer for `model_id`, reusing the last loaded one.

    Constructing SentenceTransformer builds a new model object on every call,
    so the instance is memoized here. Only the most recent model is retained,
    so switching model ids does not accumulate models in memory.

    Raises whatever SentenceTransformer raises when the model cannot be loaded;
    a failed load leaves the cache untouched.
    """
    embedder = _EMBEDDER_CACHE.get(model_id)
    if embedder is None:
        embedder = SentenceTransformer(model_id)
        _EMBEDDER_CACHE.clear()
        _EMBEDDER_CACHE[model_id] = embedder
    return embedder
203
+
204
+
205
+ # -------------------------
206
+ # Gradio callbacks
207
+ # -------------------------
208
def handle_upload(file_obj, embed_model_id):
    """
    Gradio callback: read the uploaded resume, chunk it, embed it, and index it.

    Args:
        file_obj: Value from the file upload component, or None when nothing
            was uploaded. Accepted either as a filesystem path string or as an
            object whose `.name` attribute is the path.
        embed_model_id: SentenceTransformers model id used to embed the chunks.

    Returns:
        A (status, preview, chat_history) tuple. On any failure the status
        explains the problem, the preview is "" and the history is None.
        On success the preview holds up to 2000 characters of extracted text
        and the history is [] so the chat starts fresh.
    """
    if file_obj is None:
        return "No file uploaded.", "", None

    # Accept a plain path as well as a file-like wrapper exposing `.name`,
    # so a string value does not crash on attribute access.
    path = file_obj if isinstance(file_obj, str) else file_obj.name
    try:
        text = extract_resume_text(path)
    except Exception as e:
        return f"Failed to read file: {e}", "", None

    if not text.strip():
        return "Uploaded file has no extractable text. Try a different PDF (not scanned) or upload DOCX.", "", None

    chunks = chunk_text(text)

    # Load embedder
    try:
        embedder = load_embedder(embed_model_id)
    except Exception as e:
        return f"Failed to load embedding model '{embed_model_id}': {e}", "", None

    # Embed and index
    try:
        embs = embedder.encode(chunks, convert_to_numpy=True, show_progress_bar=False)
        index, _ = build_faiss_index(embs)
    except Exception as e:
        return f"Failed to embed and index: {e}", "", None

    # Publish to shared state only after every step succeeded, so a failed
    # upload leaves the previously indexed resume intact.
    STATE.embedder = embedder
    STATE.index = index
    STATE.chunks = chunks
    STATE.resume_text = text
    STATE.embed_model_id = embed_model_id

    preview = text[:2000] + ("\n\n... (truncated preview)" if len(text) > 2000 else "")
    status = f"Resume loaded. Extracted {len(text)} characters, created {len(chunks)} chunks, FAISS index ready."
    return status, preview, []
249
+
250
+
251
def answer_question(message, history, chat_model_id):
    """
    Gradio callback: answer `message` from the indexed resume.

    Retrieves the top chunks, builds the grounded prompt, calls the chat model
    when a client is available, and appends a (question, answer + sources)
    pair to the chat history. Returns the updated history.

    If no resume is indexed yet, a reminder is appended instead. A blank
    message returns the history unchanged.
    """
    if not STATE.ready():
        transcript = history or []
        transcript.append((message, "Please upload a resume first (PDF or DOCX)."))
        return transcript

    question = (message or "").strip()
    if not question:
        return history

    hits = retrieve(question, STATE.embedder, STATE.index, STATE.chunks, top_k=TOP_K)

    # One markdown entry per hit; long chunks are cut to 550 characters for display.
    citations = [
        f"**Source {rank}** (score {hit['score']:.3f})\n\n"
        + (hit["chunk"][:550] + "..." if len(hit["chunk"]) > 550 else hit["chunk"])
        for rank, hit in enumerate(hits, start=1)
    ]

    prompt = build_prompt(question, hits)

    client = make_client()
    if client is not None:
        try:
            answer = generate_answer_hf(client, chat_model_id, prompt)
        except Exception as e:
            answer = f"Model call failed: {e}"
    else:
        answer = (
            "HF_TOKEN is not set, so I cannot call a chat model.\n\n"
            "Set a Space secret named HF_TOKEN (your Hugging Face access token), "
            "then ask again."
        )

    # The retrieved sources are always shown, even when the model call failed.
    full_answer = f"{answer}\n\n---\n### Sources\n" + "\n\n".join(citations)

    transcript = history or []
    transcript.append((question, full_answer))
    return transcript
292
+
293
+
294
+ # -------------------------
295
+ # UI
296
+ # -------------------------
297
with gr.Blocks(title="Resume Q&A (RAG)") as demo:
    # Intro banner with usage tips.
    gr.Markdown(
        "# Resume Q&A (Grounded)\n"
        "Upload your resume (PDF or DOCX). Then ask questions. Answers are grounded in retrieved sources.\n\n"
        "Tips:\n"
        "- If your PDF is scanned (image-only), text extraction may fail. Prefer DOCX or a text-based PDF.\n"
        "- Add HF_TOKEN as a Space secret to enable the chat model call.\n"
    )

    # Model selection.
    with gr.Row():
        embed_model = gr.Textbox(
            label="Embedding model (SentenceTransformers)",
            value=DEFAULT_EMBED_MODEL,
            info="Default is fast and strong for retrieval.",
        )
        chat_model = gr.Textbox(
            label="Chat model (HF Inference model id)",
            value=DEFAULT_CHAT_MODEL,
            info="Used via Hugging Face Inference API. Requires HF_TOKEN secret.",
        )

    # Resume upload and indexing.
    with gr.Row():
        uploader = gr.File(label="Upload resume (PDF or DOCX)", file_types=[".pdf", ".docx"])
        upload_btn = gr.Button("Build index")

    status = gr.Textbox(label="Status", interactive=False)
    preview = gr.Textbox(label="Extracted text preview", lines=12, interactive=False)

    # Chat area.
    gr.Markdown("## Chat")
    chatbot = gr.Chatbot(height=420)
    msg = gr.Textbox(label="Ask about the resume", placeholder="Example: What companies did I work at and what were my responsibilities?")
    send = gr.Button("Send")
    clear = gr.Button("Clear chat")

    # Building the index also resets the chat via the third output.
    upload_btn.click(handle_upload, [uploader, embed_model], [status, preview, chatbot])

    # The Send button and pressing Enter in the textbox share the same wiring:
    # answer the question, then empty the input box.
    for ask_event in (send.click, msg.submit):
        ask_event(answer_question, [msg, chatbot, chat_model], [chatbot]).then(lambda: "", None, msg)

    clear.click(lambda: [], None, chatbot)

demo.launch()