resberry committed on
Commit
815207d
·
verified ·
1 Parent(s): 639c66a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -15
app.py CHANGED
@@ -18,7 +18,7 @@ import numpy as np
18
  import wikipedia
19
 
20
  # === Configuration ===
21
- HF_TOKEN = os.getenv("HF")
22
  MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
23
  client = InferenceClient(model=MODEL_NAME, token=HF_TOKEN)
24
 
@@ -125,7 +125,7 @@ def fetch_text(url):
125
  soup = BeautifulSoup(doc.summary(), "html.parser")
126
  text = " ".join(p.get_text() for p in soup.find_all("p")).strip()
127
  return text, url
128
- except:
129
  return "", url
130
 
131
  def scrape_and_save(query):
@@ -204,13 +204,18 @@ def answer_from_context(question):
204
 
205
  prompt = f"""
206
  Today's date is {datetime.utcnow().date()}.
207
- Use context and memory to answer and summarize the following question using fully finished lines, clear, and grammatically correct finished sentences. Ensure the response is factually accurate, complete, well-organized, and easy to understand. Avoid repeating information, unfinished sentences, and keep it concise but informative.
 
208
  [CONTEXT]
209
  {context}
 
210
  [MEMORY]
211
  {memory_prompt}
 
212
  [QUESTION]
 
213
  {question}
 
214
  [ANSWER]
215
  """
216
  try:
@@ -234,25 +239,29 @@ Use context and memory to answer and summarize the following question using full
234
  def needs_web_search_llm(question):
235
  prompt = f"""
236
  You are a helpful assistant that classifies whether a question requires a web search or external data.
 
237
  Question: "{question}"
 
238
  Answer with only "YES" if a web search is needed or "NO" if not.
239
  """
240
  try:
241
  response = client.text_generation(prompt, max_new_tokens=10)
242
  return "YES" in response.strip().upper()
243
- except Exception:
244
  return False
245
 
246
  def is_general_knowledge_question(question):
247
  prompt = f"""
248
  You are a classifier. Determine if the question below can be answered using general world knowledge, like an encyclopedia or Wikipedia.
 
249
  Question: "{question}"
 
250
  Answer with "YES" if it is general knowledge. Otherwise answer "NO".
251
  """
252
  try:
253
  response = client.text_generation(prompt, max_new_tokens=10)
254
  return "YES" in response.strip().upper()
255
- except Exception:
256
  return False
257
 
258
  def get_wikipedia_summary(query, sentences=3):
@@ -263,9 +272,10 @@ def get_wikipedia_summary(query, sentences=3):
263
  return f"Ambiguous question. Possible topics: {', '.join(e.options[:5])}"
264
  except wikipedia.exceptions.PageError:
265
  return "No Wikipedia article found for that topic."
266
- except Exception:
267
  return "Error accessing Wikipedia."
268
 
 
269
  def semantic_scholar_search(query, max_results=5):
270
  params = {
271
  "query": query,
@@ -277,7 +287,8 @@ def semantic_scholar_search(query, max_results=5):
277
  resp.raise_for_status()
278
  data = resp.json()
279
  papers = data.get("data", [])
280
- texts, urls = [], []
 
281
  for p in papers:
282
  title = p.get("title", "")
283
  abstract = p.get("abstract", "")
@@ -299,38 +310,64 @@ def semantic_scholar_search(query, max_results=5):
299
  return "", []
300
 
301
  def is_research_question(question):
302
- research_keywords = ["study", "paper", "evidence", "method", "experiment", "data", "results", "findings", "theory", "analysis"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  q_lower = question.lower()
304
- return any(kw in q_lower for kw in research_keywords)
305
 
306
  def ask(q):
 
307
  if is_research_question(q):
308
  context, sources = semantic_scholar_search(q)
309
  if context:
310
  answer, sources, _ = answer_from_context(q)
311
- return answer, "\n".join(f"- {url}" for url in sources)
 
 
312
  context, sources = scrape_and_save(q)
313
  answer, sources, _ = answer_from_context(q)
314
- return answer, "\n".join(f"- {url}" for url in sources)
 
315
 
 
316
  if is_general_knowledge_question(q):
317
  return get_wikipedia_summary(q), "Source: Wikipedia"
318
 
 
319
  _, _, avg_sim = retrieve_context_from_chunks(q)
 
 
320
  intent_search = needs_web_search_llm(q)
321
 
322
  if intent_search or avg_sim < MIN_CONTEXT_SIMILARITY:
323
  context, sources = scrape_and_save(q)
324
  answer, sources, _ = answer_from_context(q)
325
- return answer, "\n".join(f"- {url}" for url in sources)
326
  else:
327
- prompt = f"<|user|>\n{q.strip()}\n<|assistant|>\n"
 
328
  try:
329
  response = client.text_generation(prompt, max_new_tokens=512)
330
  answer = response.strip().split("<|assistant|>")[-1].strip()
331
  except Exception as e:
332
  answer = f"Error: {e}"
333
- return answer, ""
 
 
334
 
335
  # === Gradio UI ===
336
  with gr.Blocks() as demo:
@@ -353,4 +390,4 @@ if __name__ == '__main__':
353
  question = " ".join(sys.argv[1:])
354
  print(ask(question))
355
  else:
356
- demo.launch()
 
18
  import wikipedia
19
 
20
  # === Configuration ===
21
+ HF_TOKEN = os.getenv("HF")
22
  MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
23
  client = InferenceClient(model=MODEL_NAME, token=HF_TOKEN)
24
 
 
125
  soup = BeautifulSoup(doc.summary(), "html.parser")
126
  text = " ".join(p.get_text() for p in soup.find_all("p")).strip()
127
  return text, url
128
+ except Exception as e:
129
  return "", url
130
 
131
  def scrape_and_save(query):
 
204
 
205
  prompt = f"""
206
  Today's date is {datetime.utcnow().date()}.
207
+ Use context and memory to answer and summarize the following question using fullly finished lines end with., clear, and grammatically correct finish sentences. Ensure that the response is factually accurate, complete, well-organized, finish sentences and easy to understand. Avoid repeating information,unfinish sentences and keep the response concise while still being informative.
208
+
209
  [CONTEXT]
210
  {context}
211
+
212
  [MEMORY]
213
  {memory_prompt}
214
+
215
  [QUESTION]
216
+ Answer and summarize the following question using fullly finish linesens end with., clear, and grammatically correct finish sentences. Ensure that the response is factually accurate, complete, well-organized, finish sentences and easy to understand. Avoid repeating information, unfinish sentences, and keep the response concise while still being informative.
217
  {question}
218
+
219
  [ANSWER]
220
  """
221
  try:
 
239
def needs_web_search_llm(question):
    """Ask the LLM whether *question* requires a web search / external data.

    Parameters
    ----------
    question : str
        The user's raw question text.

    Returns
    -------
    bool
        True when the model's reply contains "YES"; False otherwise,
        including on any inference failure (best-effort: a broken model
        call degrades to "no search needed" rather than crashing).
    """
    prompt = f"""
You are a helpful assistant that classifies whether a question requires a web search or external data.

Question: "{question}"

Answer with only "YES" if a web search is needed or "NO" if not.
"""
    try:
        # NOTE(review): `client` is the module-level InferenceClient; assumes
        # it is configured with a valid token — confirm at startup.
        response = client.text_generation(prompt, max_new_tokens=10)
        # Substring check tolerates extra tokens around the verdict.
        return "YES" in response.strip().upper()
    except Exception:
        # Fixed: dropped the unused `as e` binding. Swallowing is deliberate —
        # classification failure must not abort the whole ask() pipeline.
        return False
252
 
253
def is_general_knowledge_question(question):
    """Ask the LLM whether *question* is answerable from general knowledge.

    Parameters
    ----------
    question : str
        The user's raw question text.

    Returns
    -------
    bool
        True when the model's reply contains "YES" (encyclopedia-style
        question, suitable for Wikipedia); False otherwise, including on
        any inference failure so the caller falls through to other routes.
    """
    prompt = f"""
You are a classifier. Determine if the question below can be answered using general world knowledge, like an encyclopedia or Wikipedia.

Question: "{question}"

Answer with "YES" if it is general knowledge. Otherwise answer "NO".
"""
    try:
        # NOTE(review): relies on the module-level `client` InferenceClient.
        response = client.text_generation(prompt, max_new_tokens=10)
        # Substring check tolerates extra tokens around the verdict.
        return "YES" in response.strip().upper()
    except Exception:
        # Fixed: dropped the unused `as e` binding. Intentional best-effort
        # fallback — a failed classification routes to the default pipeline.
        return False
266
 
267
  def get_wikipedia_summary(query, sentences=3):
 
272
  return f"Ambiguous question. Possible topics: {', '.join(e.options[:5])}"
273
  except wikipedia.exceptions.PageError:
274
  return "No Wikipedia article found for that topic."
275
+ except Exception as e:
276
  return "Error accessing Wikipedia."
277
 
278
+ # === Semantic Scholar API integration ===
279
  def semantic_scholar_search(query, max_results=5):
280
  params = {
281
  "query": query,
 
287
  resp.raise_for_status()
288
  data = resp.json()
289
  papers = data.get("data", [])
290
+ texts = []
291
+ urls = []
292
  for p in papers:
293
  title = p.get("title", "")
294
  abstract = p.get("abstract", "")
 
310
  return "", []
311
 
312
def is_research_question(question):
    """Heuristically decide whether *question* is a research/scientific query.

    Matches the question against a fixed keyword list using case-insensitive
    whole-word/phrase matching.

    Fixes over the previous version:
    - Whole-word anchoring (``\\b``) removes substring false positives,
      e.g. the keyword "aim" no longer fires inside "claim".
    - Each keyword is lowercased before matching, so the "IRB" entry —
      previously dead code, since an uppercase keyword can never be a
      substring of the lowercased question — now works.

    Parameters
    ----------
    question : str
        The user's raw question text.

    Returns
    -------
    bool
        True when any research keyword appears as a whole word/phrase.
    """
    import re  # local import keeps this block self-contained

    keywords = [
        "research", "study", "paper", "findings", "experiment", "scientific", "evidence", "meta-analysis",
        "hypothesis", "literature review", "case study", "theory", "framework", "methodology", "analysis",
        "data", "observation", "results", "variables", "survey", "questionnaire", "sampling", "experiment design",
        "quantitative", "qualitative", "mixed methods", "statistical", "inference", "regression", "correlation",
        "interview", "focus group", "coding", "themes", "interpretation", "reliability", "validity", "bias",
        "significance", "conclusion", "discussion", "implications", "limitations", "future research", "peer review",
        "publication", "citation", "replication", "protocol", "ethics", "IRB", "research question", "objective",
        "aim", "problem statement", "gap", "contribution", "novelty", "originality", "dataset", "case", "fieldwork",
        "observational", "experimental", "review", "systematic review", "control group", "randomized", "longitudinal",
        "cross-sectional", "data analysis", "research design", "conceptual", "empirical", "exploratory", "descriptive",
        "causal", "predictive", "construct", "operationalization", "dependent variable", "independent variable",
        "mediator", "moderator", "association", "impact", "effect", "relationship", "outcome", "measure", "coding scheme"
    ]

    q_lower = question.lower()
    # \b anchors require the keyword to appear as a standalone word/phrase.
    return any(
        re.search(r"\b" + re.escape(kw.lower()) + r"\b", q_lower)
        for kw in keywords
    )
331
 
332
def ask(q):
    """Route a user question to the best answering strategy.

    Strategy order:
      1. Research-style questions -> Semantic Scholar, with an ordinary
         web scrape as fallback when it returns nothing.
      2. General-knowledge questions -> Wikipedia summary.
      3. Everything else: fresh web scrape when the LLM asks for one or the
         stored context is too dissimilar; otherwise query the model directly.

    Returns a ``(answer, sources_text)`` tuple; ``sources_text`` may be "".
    """

    def _bullets(urls):
        # One "- url" line per source, for display under the answer.
        return "\n".join(f"- {url}" for url in urls)

    # 1) Research/scientific questions go through Semantic Scholar first.
    if is_research_question(q):
        context, sources = semantic_scholar_search(q)
        if context:
            answer, sources, _ = answer_from_context(q)
            return answer, _bullets(sources)
        # Semantic Scholar came back empty -> fall back to a web scrape.
        context, sources = scrape_and_save(q)
        answer, sources, _ = answer_from_context(q)
        return answer, _bullets(sources)

    # 2) Encyclopedia-style questions are answered straight from Wikipedia.
    if is_general_knowledge_question(q):
        return get_wikipedia_summary(q), "Source: Wikipedia"

    # 3) Measure how well the stored chunks match, then ask the LLM whether
    #    a fresh web search is wanted (order preserved from the original).
    _, _, avg_sim = retrieve_context_from_chunks(q)
    intent_search = needs_web_search_llm(q)

    if intent_search or avg_sim < MIN_CONTEXT_SIMILARITY:
        context, sources = scrape_and_save(q)
        answer, sources, _ = answer_from_context(q)
        src_text = _bullets(sources)
    else:
        # Direct model call with no external context.
        prompt = f"<|user|>\n Answer and summarize the following question using fullly finish lines end with. , clear, and grammatically correct finish sentences. Ensure that the response is factually accurate, complete, well-organized, finish stances, and easy to understand. Avoid repeating information, unfinish sentences, and keep the response concise while still being informative.:\n{q.strip()}\n<|assistant|>\n"
        try:
            response = client.text_generation(prompt, max_new_tokens=512)
            answer = response.strip().split("<|assistant|>")[-1].strip()
        except Exception as e:
            answer = f"Error: {e}"
        src_text = ""

    return answer, src_text
371
 
372
  # === Gradio UI ===
373
  with gr.Blocks() as demo:
 
390
  question = " ".join(sys.argv[1:])
391
  print(ask(question))
392
  else:
393
+ demo.launch()