telcom committed on
Commit
4a58f2b
·
verified ·
1 Parent(s): 359afce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -62
app.py CHANGED
@@ -3,24 +3,34 @@ import re
3
  import gradio as gr
4
  import numpy as np
5
  import faiss
 
6
 
7
  from pypdf import PdfReader
8
  from docx import Document
9
  from fastembed import TextEmbedding
10
- from huggingface_hub import InferenceClient
11
 
12
 
13
  # -------------------------
14
  # Config
15
  # -------------------------
16
- HF_TOKEN = os.getenv("HF_TOKEN", "")
17
- DEFAULT_CHAT_MODEL = os.getenv("CHAT_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")
18
- DEFAULT_EMBED_MODEL = os.getenv("EMBED_MODEL_ID", "BAAI/bge-small-en-v1.5")
19
 
20
  TOP_K = int(os.getenv("TOP_K", "5"))
21
  CHUNK_CHARS = int(os.getenv("CHUNK_CHARS", "1400"))
22
  CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "250"))
23
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # -------------------------
26
  # Helpers: file -> text
@@ -112,22 +122,64 @@ def retrieve(query: str, embedder: TextEmbedding, index, chunks, top_k: int = TO
112
  return hits
113
 
114
 
 
 
 
 
 
 
 
 
 
 
115
  # -------------------------
116
- # LLM call (HF Inference API)
117
  # -------------------------
118
- def make_client():
119
- if not HF_TOKEN:
120
- return None
121
- return InferenceClient(token=HF_TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
 
 
123
 
124
- def build_prompt(question: str, contexts: list):
125
- ctx_blocks = []
126
- for i, c in enumerate(contexts, start=1):
127
- ctx_blocks.append(f"[Source {i} | score={c['score']:.3f}]\n{c['chunk']}")
128
- ctx_text = "\n\n".join(ctx_blocks).strip()
129
 
130
- rules = (
131
  "You are a resume assistant.\n"
132
  "Answer ONLY using the provided SOURCES.\n"
133
  "If the answer is not explicitly supported by the SOURCES, say: "
@@ -136,36 +188,24 @@ def build_prompt(question: str, contexts: list):
136
  "Keep it concise and professional.\n"
137
  )
138
 
139
- return (
140
- f"{rules}\n"
141
- f"SOURCES:\n{ctx_text}\n\n"
142
  f"QUESTION:\n{question}\n\n"
143
  f"ANSWER:"
144
  )
145
 
146
-
147
- def generate_answer_hf(client: InferenceClient, model_id: str, prompt: str):
148
- resp = client.text_generation(
149
- model=model_id,
150
- prompt=prompt,
151
- max_new_tokens=320,
152
- temperature=0.2,
153
  top_p=0.9,
154
- repetition_penalty=1.05,
155
- do_sample=True,
156
- return_full_text=False,
157
  )
158
- return (resp or "").strip()
159
 
160
-
161
- def format_sources(hits):
162
- lines = []
163
- for i, h in enumerate(hits, start=1):
164
- snippet = re.sub(r"\s+", " ", h["chunk"].strip())
165
- if len(snippet) > 260:
166
- snippet = snippet[:260] + "..."
167
- lines.append(f"- Source {i} (score {h['score']:.3f}): {snippet}")
168
- return "\n".join(lines)
169
 
170
 
171
  # -------------------------
@@ -178,7 +218,6 @@ class AppState:
178
  self.chunks = []
179
  self.ready = False
180
 
181
-
182
  STATE = AppState()
183
 
184
 
@@ -235,7 +274,7 @@ def on_build(file_obj):
235
  return status_badge(False, "Could not chunk the resume. Try DOCX."), gr.update(interactive=False), []
236
 
237
  try:
238
- embedder = TextEmbedding(model_name=DEFAULT_EMBED_MODEL)
239
  vecs = np.array(list(embedder.embed(chunks)), dtype="float32")
240
  index = build_faiss_index(vecs)
241
  except Exception:
@@ -246,10 +285,11 @@ def on_build(file_obj):
246
  STATE.chunks = chunks
247
  STATE.ready = True
248
 
 
249
  return status_badge(True, "Resume loaded. Ask your question below."), gr.update(interactive=True), []
250
 
251
 
252
- def on_ask(question, history, chat_model_id):
253
  history = history or []
254
  q = (question or "").strip()
255
  if not q:
@@ -261,19 +301,11 @@ def on_ask(question, history, chat_model_id):
261
  return history
262
 
263
  hits = retrieve(q, STATE.embedder, STATE.index, STATE.chunks, top_k=TOP_K)
264
- prompt = build_prompt(q, hits)
265
 
266
- client = make_client()
267
- if client is None:
268
- answer = (
269
- "HF_TOKEN is not set, so I cannot call the chat model.\n\n"
270
- "Add a Space secret named HF_TOKEN, then try again."
271
- )
272
- else:
273
- try:
274
- answer = generate_answer_hf(client, chat_model_id, prompt)
275
- except Exception as e:
276
- answer = f"Model call failed: {e}"
277
 
278
  final = f"{answer}\n\nSources:\n{format_sources(hits)}"
279
 
@@ -295,7 +327,7 @@ with gr.Blocks(title="ResumeQA") as demo:
295
  <div style="margin-bottom:10px;">
296
  <div style="font-size:28px;font-weight:900;">ResumeQA</div>
297
  <div style="opacity:0.82;margin-top:2px;">
298
- Upload a resume, then ask questions. Answers stay grounded in the document.
299
  </div>
300
  </div>
301
  """
@@ -306,25 +338,21 @@ with gr.Blocks(title="ResumeQA") as demo:
306
  uploader = gr.File(label="Upload resume (PDF or DOCX)", file_types=[".pdf", ".docx"], height=90)
307
  build_btn = gr.Button("Build resume index", variant="primary")
308
 
309
- # No type/format args, this build uses messages by default
310
  chatbot = gr.Chatbot(label="Chat", height=430)
311
 
312
  with gr.Row():
313
  question = gr.Textbox(
314
  label="Your question",
315
- placeholder="Example: What are my strongest skills for a Solution Architect role?",
316
  interactive=False
317
  )
318
  ask_btn = gr.Button("Ask", variant="primary")
319
 
320
  clear_btn = gr.Button("Clear chat", variant="secondary")
321
- chat_model = gr.Textbox(value=DEFAULT_CHAT_MODEL, visible=False)
322
 
323
  build_btn.click(fn=on_build, inputs=[uploader], outputs=[status_html, question, chatbot])
324
-
325
- ask_btn.click(fn=on_ask, inputs=[question, chatbot, chat_model], outputs=[chatbot]).then(lambda: "", None, question)
326
- question.submit(fn=on_ask, inputs=[question, chatbot, chat_model], outputs=[chatbot]).then(lambda: "", None, question)
327
-
328
  clear_btn.click(fn=on_clear, inputs=None, outputs=[chatbot])
329
 
330
- demo.queue(default_concurrency_limit=8).launch(css=CSS, ssr_mode=False)
 
3
  import gradio as gr
4
  import numpy as np
5
  import faiss
6
+ import requests
7
 
8
  from pypdf import PdfReader
9
  from docx import Document
10
  from fastembed import TextEmbedding
11
+ from llama_cpp import Llama
12
 
13
 
14
  # -------------------------
15
  # Config
16
  # -------------------------
17
+ EMBED_MODEL = os.getenv("EMBED_MODEL_ID", "BAAI/bge-small-en-v1.5")
 
 
18
 
19
  TOP_K = int(os.getenv("TOP_K", "5"))
20
  CHUNK_CHARS = int(os.getenv("CHUNK_CHARS", "1400"))
21
  CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "250"))
22
 
23
+ MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "260"))
24
+ TEMPERATURE = float(os.getenv("TEMPERATURE", "0.2"))
25
+
26
+ # GGUF model path and optional public download URL
27
+ MODEL_PATH = os.getenv("GGUF_MODEL_PATH", "models/model.gguf")
28
+ MODEL_URL = os.getenv("GGUF_MODEL_URL", "") # optional, public direct link to a .gguf
29
+
30
+ # GPU layers: -1 means "as many as possible"
31
+ N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "-1"))
32
+ N_CTX = int(os.getenv("N_CTX", "4096"))
33
+
34
 
35
  # -------------------------
36
  # Helpers: file -> text
 
122
  return hits
123
 
124
 
125
+ def format_sources(hits):
126
+ lines = []
127
+ for i, h in enumerate(hits, start=1):
128
+ snippet = re.sub(r"\s+", " ", h["chunk"].strip())
129
+ if len(snippet) > 220:
130
+ snippet = snippet[:220] + "..."
131
+ lines.append(f"- Source {i} (score {h['score']:.3f}): {snippet}")
132
+ return "\n".join(lines)
133
+
134
+
135
  # -------------------------
136
+ # Local LLM (llama.cpp)
137
  # -------------------------
138
+ _LLM = None
139
+
140
+ def ensure_model_file():
141
+ os.makedirs(os.path.dirname(MODEL_PATH) or ".", exist_ok=True)
142
+ if os.path.exists(MODEL_PATH) and os.path.getsize(MODEL_PATH) > 10_000_000:
143
+ return
144
+
145
+ if not MODEL_URL:
146
+ raise RuntimeError(
147
+ "GGUF model file not found. Set GGUF_MODEL_PATH to an existing .gguf in the repo, "
148
+ "or provide GGUF_MODEL_URL (public direct link to a .gguf)."
149
+ )
150
+
151
+ # Download the model once
152
+ with requests.get(MODEL_URL, stream=True, timeout=120) as r:
153
+ r.raise_for_status()
154
+ with open(MODEL_PATH, "wb") as f:
155
+ for chunk in r.iter_content(chunk_size=1024 * 1024):
156
+ if chunk:
157
+ f.write(chunk)
158
+
159
+ def get_llm():
160
+ global _LLM
161
+ if _LLM is not None:
162
+ return _LLM
163
+
164
+ ensure_model_file()
165
+
166
+ # If CUDA build is present, n_gpu_layers=-1 will push as much as possible to GPU
167
+ _LLM = Llama(
168
+ model_path=MODEL_PATH,
169
+ n_ctx=N_CTX,
170
+ n_threads=max(2, os.cpu_count() or 4),
171
+ n_gpu_layers=N_GPU_LAYERS,
172
+ verbose=False,
173
+ )
174
+ return _LLM
175
+
176
 
177
+ def answer_with_llm(question: str, hits: list):
178
+ llm = get_llm()
179
 
180
+ sources_text = "\n\n".join([f"[Source {i+1}]\n{h['chunk']}" for i, h in enumerate(hits)])
 
 
 
 
181
 
182
+ system = (
183
  "You are a resume assistant.\n"
184
  "Answer ONLY using the provided SOURCES.\n"
185
  "If the answer is not explicitly supported by the SOURCES, say: "
 
188
  "Keep it concise and professional.\n"
189
  )
190
 
191
+ prompt = (
192
+ f"{system}\n\n"
193
+ f"SOURCES:\n{sources_text}\n\n"
194
  f"QUESTION:\n{question}\n\n"
195
  f"ANSWER:"
196
  )
197
 
198
+ out = llm(
199
+ prompt,
200
+ max_tokens=MAX_NEW_TOKENS,
201
+ temperature=TEMPERATURE,
 
 
 
202
  top_p=0.9,
203
+ repeat_penalty=1.05,
204
+ stop=["\n\nQUESTION:", "\n\nSOURCES:"],
 
205
  )
 
206
 
207
+ text = out["choices"][0]["text"].strip()
208
+ return text
 
 
 
 
 
 
 
209
 
210
 
211
  # -------------------------
 
218
  self.chunks = []
219
  self.ready = False
220
 
 
221
  STATE = AppState()
222
 
223
 
 
274
  return status_badge(False, "Could not chunk the resume. Try DOCX."), gr.update(interactive=False), []
275
 
276
  try:
277
+ embedder = TextEmbedding(model_name=EMBED_MODEL)
278
  vecs = np.array(list(embedder.embed(chunks)), dtype="float32")
279
  index = build_faiss_index(vecs)
280
  except Exception:
 
285
  STATE.chunks = chunks
286
  STATE.ready = True
287
 
288
+ # Warm up LLM lazily later, do not block UI
289
  return status_badge(True, "Resume loaded. Ask your question below."), gr.update(interactive=True), []
290
 
291
 
292
+ def on_ask(question, history):
293
  history = history or []
294
  q = (question or "").strip()
295
  if not q:
 
301
  return history
302
 
303
  hits = retrieve(q, STATE.embedder, STATE.index, STATE.chunks, top_k=TOP_K)
 
304
 
305
+ try:
306
+ answer = answer_with_llm(q, hits)
307
+ except Exception as e:
308
+ answer = f"Local model error: {e}"
 
 
 
 
 
 
 
309
 
310
  final = f"{answer}\n\nSources:\n{format_sources(hits)}"
311
 
 
327
  <div style="margin-bottom:10px;">
328
  <div style="font-size:28px;font-weight:900;">ResumeQA</div>
329
  <div style="opacity:0.82;margin-top:2px;">
330
+ Upload a resume, then ask questions. Everything runs locally.
331
  </div>
332
  </div>
333
  """
 
338
  uploader = gr.File(label="Upload resume (PDF or DOCX)", file_types=[".pdf", ".docx"], height=90)
339
  build_btn = gr.Button("Build resume index", variant="primary")
340
 
 
341
  chatbot = gr.Chatbot(label="Chat", height=430)
342
 
343
  with gr.Row():
344
  question = gr.Textbox(
345
  label="Your question",
346
+ placeholder="Example: What roles have I held, and what impact did I deliver?",
347
  interactive=False
348
  )
349
  ask_btn = gr.Button("Ask", variant="primary")
350
 
351
  clear_btn = gr.Button("Clear chat", variant="secondary")
 
352
 
353
  build_btn.click(fn=on_build, inputs=[uploader], outputs=[status_html, question, chatbot])
354
+ ask_btn.click(fn=on_ask, inputs=[question, chatbot], outputs=[chatbot]).then(lambda: "", None, question)
355
+ question.submit(fn=on_ask, inputs=[question, chatbot], outputs=[chatbot]).then(lambda: "", None, question)
 
 
356
  clear_btn.click(fn=on_clear, inputs=None, outputs=[chatbot])
357
 
358
+ demo.queue(default_concurrency_limit=4).launch(css=CSS, ssr_mode=False)