Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -125,7 +125,7 @@ def fetch_text(url):
|
|
| 125 |
soup = BeautifulSoup(doc.summary(), "html.parser")
|
| 126 |
text = " ".join(p.get_text() for p in soup.find_all("p")).strip()
|
| 127 |
return text, url
|
| 128 |
-
except
|
| 129 |
return "", url
|
| 130 |
|
| 131 |
def scrape_and_save(query):
|
|
@@ -204,18 +204,13 @@ def answer_from_context(question):
|
|
| 204 |
|
| 205 |
prompt = f"""
|
| 206 |
Today's date is {datetime.utcnow().date()}.
|
| 207 |
-
Use context and memory to answer and summarize the following question using
|
| 208 |
-
|
| 209 |
[CONTEXT]
|
| 210 |
{context}
|
| 211 |
-
|
| 212 |
[MEMORY]
|
| 213 |
{memory_prompt}
|
| 214 |
-
|
| 215 |
[QUESTION]
|
| 216 |
-
Answer and summarize the following question using fullly finish linesens end with., clear, and grammatically correct finish sentences. Ensure that the response is factually accurate, complete, well-organized, finish sentences and easy to understand. Avoid repeating information, unfinish sentences, and keep the response concise while still being informative.
|
| 217 |
{question}
|
| 218 |
-
|
| 219 |
[ANSWER]
|
| 220 |
"""
|
| 221 |
try:
|
|
@@ -239,29 +234,25 @@ Answer and summarize the following question using fullly finish linesens end wit
|
|
| 239 |
def needs_web_search_llm(question):
    """Ask the LLM whether *question* needs a web search or external data.

    Sends a YES/NO classification prompt through the module-level ``client``
    (defined outside this view) via ``client.text_generation``.

    Args:
        question: The user's question; interpolated verbatim into the prompt.

    Returns:
        True if the upper-cased model reply contains "YES"; False otherwise,
        including when the model call raises.
    """
    # Local import: the file's import block is outside this view.
    import logging

    prompt = f"""
You are a helpful assistant that classifies whether a question requires a web search or external data.

Question: "{question}"

Answer with only "YES" if a web search is needed or "NO" if not.
"""
    try:
        response = client.text_generation(prompt, max_new_tokens=10)
        # Substring check: any reply containing "YES" counts as affirmative.
        return "YES" in response.strip().upper()
    except Exception:  # the colon was missing here, which broke the build
        # Best-effort classifier: report the failure but fail closed (no search).
        logging.getLogger(__name__).warning(
            "needs_web_search_llm: classification failed", exc_info=True
        )
        return False
|
| 252 |
|
| 253 |
def is_general_knowledge_question(question):
    """Ask the LLM whether *question* is answerable from general knowledge.

    Sends a YES/NO classification prompt through the module-level ``client``
    (defined outside this view) via ``client.text_generation``.

    Args:
        question: The user's question; interpolated verbatim into the prompt.

    Returns:
        True if the upper-cased model reply contains "YES"; False otherwise,
        including when the model call raises.
    """
    # Local import: the file's import block is outside this view.
    import logging

    prompt = f"""
You are a classifier. Determine if the question below can be answered using general world knowledge, like an encyclopedia or Wikipedia.

Question: "{question}"

Answer with "YES" if it is general knowledge. Otherwise answer "NO".
"""
    try:
        response = client.text_generation(prompt, max_new_tokens=10)
        # Substring check: any reply containing "YES" counts as affirmative.
        return "YES" in response.strip().upper()
    except Exception:  # the colon was missing here, which broke the build
        # Best-effort classifier: report the failure but fail closed.
        logging.getLogger(__name__).warning(
            "is_general_knowledge_question: classification failed", exc_info=True
        )
        return False
|
| 266 |
|
| 267 |
def get_wikipedia_summary(query, sentences=3):
|
|
@@ -272,10 +263,9 @@ def get_wikipedia_summary(query, sentences=3):
|
|
| 272 |
return f"Ambiguous question. Possible topics: {', '.join(e.options[:5])}"
|
| 273 |
except wikipedia.exceptions.PageError:
|
| 274 |
return "No Wikipedia article found for that topic."
|
| 275 |
-
except Exception
|
| 276 |
return "Error accessing Wikipedia."
|
| 277 |
|
| 278 |
-
# === Semantic Scholar API integration ===
|
| 279 |
def semantic_scholar_search(query, max_results=5):
|
| 280 |
params = {
|
| 281 |
"query": query,
|
|
@@ -287,8 +277,7 @@ def semantic_scholar_search(query, max_results=5):
|
|
| 287 |
resp.raise_for_status()
|
| 288 |
data = resp.json()
|
| 289 |
papers = data.get("data", [])
|
| 290 |
-
texts = []
|
| 291 |
-
urls = []
|
| 292 |
for p in papers:
|
| 293 |
title = p.get("title", "")
|
| 294 |
abstract = p.get("abstract", "")
|
|
@@ -310,64 +299,38 @@ def semantic_scholar_search(query, max_results=5):
|
|
| 310 |
return "", []
|
| 311 |
|
| 312 |
def is_research_question(question):
    """Return True when *question* contains a research-related keyword.

    Matching is a case-insensitive substring test, so short keywords such as
    "aim", "gap" or "case" also match inside longer words.

    Args:
        question: The user's question. ``None``, empty, or non-string input
            is treated as "not a research question".

    Returns:
        bool: True if any keyword occurs in the lower-cased question.
    """
    if not isinstance(question, str) or not question:
        return False

    keywords = [
        "research", "study", "paper", "findings", "experiment", "scientific", "evidence", "meta-analysis",
        "hypothesis", "literature review", "case study", "theory", "framework", "methodology", "analysis",
        "data", "observation", "results", "variables", "survey", "questionnaire", "sampling", "experiment design",
        "quantitative", "qualitative", "mixed methods", "statistical", "inference", "regression", "correlation",
        "interview", "focus group", "coding", "themes", "interpretation", "reliability", "validity", "bias",
        "significance", "conclusion", "discussion", "implications", "limitations", "future research", "peer review",
        "publication", "citation", "replication", "protocol", "ethics", "IRB", "research question", "objective",
        "aim", "problem statement", "gap", "contribution", "novelty", "originality", "dataset", "case", "fieldwork",
        "observational", "experimental", "review", "systematic review", "control group", "randomized", "longitudinal",
        "cross-sectional", "data analysis", "research design", "conceptual", "empirical", "exploratory", "descriptive",
        "causal", "predictive", "construct", "operationalization", "dependent variable", "independent variable",
        "mediator", "moderator", "association", "impact", "effect", "relationship", "outcome", "measure", "coding scheme",
    ]

    q_lower = question.lower()
    # Lower-case each keyword too: "IRB" could never match a lower-cased question.
    return any(kw.lower() in q_lower for kw in keywords)
|
| 331 |
|
| 332 |
def _format_sources(sources):
    """Render source URLs as a "- url" bullet list, one per line."""
    return "\n".join(f"- {url}" for url in (sources or []))


def ask(q):
    """Answer question *q*, choosing a retrieval strategy by question type.

    Order of checks:
      1. Research question: query Semantic Scholar, fall back to a web
         scrape if it returns no context, then answer from context.
      2. General-knowledge question: return the Wikipedia summary.
      3. Otherwise answer from stored context, scraping the web first when
         stored similarity is too low or the LLM says a search is needed;
         if neither applies, ask the model directly.

    Args:
        q: The user's question.

    Returns:
        tuple[str, str]: ``(answer, sources_text)``. ``sources_text`` is a
        bullet list of URLs, ``"Source: Wikipedia"``, or ``""`` when the
        answer came straight from the model.
    """
    # An empty question would only waste classifier and search calls.
    if not q or not q.strip():
        return "Please enter a question.", ""

    # Check if research/scientific question and use Semantic Scholar.
    if is_research_question(q):
        # NOTE(review): the text returned here is not passed on; presumably
        # answer_from_context reads stored chunks itself - confirm.
        context, _ = semantic_scholar_search(q)
        if not context:
            # Fall back to regular web search if Semantic Scholar fails.
            scrape_and_save(q)
        answer, sources, _ = answer_from_context(q)
        # Build the text here: `sources_text` was never assigned in the visible code.
        return answer, _format_sources(sources)

    # General knowledge questions use Wikipedia.
    if is_general_knowledge_question(q):
        return get_wikipedia_summary(q), "Source: Wikipedia"

    # Check if we already have context stored with sufficient similarity.
    _, _, avg_sim = retrieve_context_from_chunks(q)

    # Test similarity first so the LLM classifier call is skipped when a
    # search is already required.
    if avg_sim < MIN_CONTEXT_SIMILARITY or needs_web_search_llm(q):
        scrape_and_save(q)
        answer, sources, _ = answer_from_context(q)
        return answer, _format_sources(sources)

    prompt = f"<|user|>\n Answer and summarize the following question using fullly finish lines end with. , clear, and grammatically correct finish sentences. Ensure that the response is factually accurate, complete, well-organized, finish stances, and easy to understand. Avoid repeating information, unfinish sentences, and keep the response concise while still being informative.:\n{q.strip()}\n<|assistant|>\n"
    try:
        response = client.text_generation(prompt, max_new_tokens=512)
        # Keep only the text after the last assistant tag, if the model echoed it.
        answer = response.strip().split("<|assistant|>")[-1].strip()
    except Exception as e:
        answer = f"Error: {e}"
    return answer, ""
|
| 371 |
|
| 372 |
# === Gradio UI ===
|
| 373 |
with gr.Blocks() as demo:
|
|
@@ -390,4 +353,4 @@ if __name__ == '__main__':
|
|
| 390 |
question = " ".join(sys.argv[1:])
|
| 391 |
print(ask(question))
|
| 392 |
else:
|
| 393 |
-
demo.launch()
|
|
|
|
| 125 |
soup = BeautifulSoup(doc.summary(), "html.parser")
|
| 126 |
text = " ".join(p.get_text() for p in soup.find_all("p")).strip()
|
| 127 |
return text, url
|
| 128 |
+
except:
|
| 129 |
return "", url
|
| 130 |
|
| 131 |
def scrape_and_save(query):
|
|
|
|
| 204 |
|
| 205 |
prompt = f"""
|
| 206 |
Today's date is {datetime.utcnow().date()}.
|
| 207 |
+
Use context and memory to answer and summarize the following question using fully finished lines, clear, and grammatically correct finished sentences. Ensure the response is factually accurate, complete, well-organized, and easy to understand. Avoid repeating information, unfinished sentences, and keep it concise but informative.
|
|
|
|
| 208 |
[CONTEXT]
|
| 209 |
{context}
|
|
|
|
| 210 |
[MEMORY]
|
| 211 |
{memory_prompt}
|
|
|
|
| 212 |
[QUESTION]
|
|
|
|
| 213 |
{question}
|
|
|
|
| 214 |
[ANSWER]
|
| 215 |
"""
|
| 216 |
try:
|
|
|
|
| 234 |
def needs_web_search_llm(question):
    """Ask the LLM whether *question* needs a web search or external data.

    Sends a YES/NO classification prompt through the module-level ``client``
    (defined outside this view) via ``client.text_generation``.

    Args:
        question: The user's question; interpolated verbatim into the prompt.

    Returns:
        True if the upper-cased model reply contains "YES"; False otherwise,
        including when the model call raises.
    """
    # Local import: the file's import block is outside this view.
    import logging

    prompt = f"""
You are a helpful assistant that classifies whether a question requires a web search or external data.
Question: "{question}"
Answer with only "YES" if a web search is needed or "NO" if not.
"""
    try:
        response = client.text_generation(prompt, max_new_tokens=10)
        # Substring check: any reply containing "YES" counts as affirmative.
        return "YES" in response.strip().upper()
    except Exception:
        # Best-effort classifier: report the failure but fail closed (no search).
        logging.getLogger(__name__).warning(
            "needs_web_search_llm: classification failed", exc_info=True
        )
        return False
|
| 245 |
|
| 246 |
def is_general_knowledge_question(question):
    """Ask the LLM whether *question* is answerable from general knowledge.

    Sends a YES/NO classification prompt through the module-level ``client``
    (defined outside this view) via ``client.text_generation``.

    Args:
        question: The user's question; interpolated verbatim into the prompt.

    Returns:
        True if the upper-cased model reply contains "YES"; False otherwise,
        including when the model call raises.
    """
    # Local import: the file's import block is outside this view.
    import logging

    prompt = f"""
You are a classifier. Determine if the question below can be answered using general world knowledge, like an encyclopedia or Wikipedia.
Question: "{question}"
Answer with "YES" if it is general knowledge. Otherwise answer "NO".
"""
    try:
        response = client.text_generation(prompt, max_new_tokens=10)
        # Substring check: any reply containing "YES" counts as affirmative.
        return "YES" in response.strip().upper()
    except Exception:
        # Best-effort classifier: report the failure but fail closed.
        logging.getLogger(__name__).warning(
            "is_general_knowledge_question: classification failed", exc_info=True
        )
        return False
|
| 257 |
|
| 258 |
def get_wikipedia_summary(query, sentences=3):
|
|
|
|
| 263 |
return f"Ambiguous question. Possible topics: {', '.join(e.options[:5])}"
|
| 264 |
except wikipedia.exceptions.PageError:
|
| 265 |
return "No Wikipedia article found for that topic."
|
| 266 |
+
except Exception:
|
| 267 |
return "Error accessing Wikipedia."
|
| 268 |
|
|
|
|
| 269 |
def semantic_scholar_search(query, max_results=5):
|
| 270 |
params = {
|
| 271 |
"query": query,
|
|
|
|
| 277 |
resp.raise_for_status()
|
| 278 |
data = resp.json()
|
| 279 |
papers = data.get("data", [])
|
| 280 |
+
texts, urls = [], []
|
|
|
|
| 281 |
for p in papers:
|
| 282 |
title = p.get("title", "")
|
| 283 |
abstract = p.get("abstract", "")
|
|
|
|
| 299 |
return "", []
|
| 300 |
|
| 301 |
def is_research_question(question):
    """Return True when *question* contains a research-related keyword.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside a longer word (e.g. "paper" in "newspaper").

    Args:
        question: The user's question. ``None``, empty, or non-string input
            is treated as "not a research question".

    Returns:
        bool: True if any keyword occurs in the lower-cased question.
    """
    if not isinstance(question, str) or not question:
        return False

    research_keywords = (
        "study", "paper", "evidence", "method", "experiment",
        "data", "results", "findings", "theory", "analysis",
    )
    q_lower = question.lower()
    return any(kw in q_lower for kw in research_keywords)
|
| 305 |
|
| 306 |
def _format_sources(sources):
    """Render source URLs as a "- url" bullet list, one per line."""
    return "\n".join(f"- {url}" for url in (sources or []))


def ask(q):
    """Answer question *q*, choosing a retrieval strategy by question type.

    Order of checks:
      1. Research question: query Semantic Scholar, fall back to a web
         scrape if it returns no context, then answer from context.
      2. General-knowledge question: return the Wikipedia summary.
      3. Otherwise answer from stored context, scraping the web first when
         stored similarity is too low or the LLM says a search is needed;
         if neither applies, ask the model directly.

    Args:
        q: The user's question.

    Returns:
        tuple[str, str]: ``(answer, sources_text)``. ``sources_text`` is a
        bullet list of URLs, ``"Source: Wikipedia"``, or ``""`` when the
        answer came straight from the model.
    """
    # An empty question would only waste classifier and search calls.
    if not q or not q.strip():
        return "Please enter a question.", ""

    if is_research_question(q):
        # NOTE(review): the text returned here is not passed on; presumably
        # answer_from_context reads stored chunks itself - confirm.
        context, _ = semantic_scholar_search(q)
        if not context:
            # Semantic Scholar produced nothing: fall back to a web scrape.
            scrape_and_save(q)
        answer, sources, _ = answer_from_context(q)
        return answer, _format_sources(sources)

    if is_general_knowledge_question(q):
        return get_wikipedia_summary(q), "Source: Wikipedia"

    _, _, avg_sim = retrieve_context_from_chunks(q)

    # Test similarity first so the LLM classifier call is skipped when a
    # search is already required.
    if avg_sim < MIN_CONTEXT_SIMILARITY or needs_web_search_llm(q):
        scrape_and_save(q)
        answer, sources, _ = answer_from_context(q)
        return answer, _format_sources(sources)

    prompt = f"<|user|>\n{q.strip()}\n<|assistant|>\n"
    try:
        response = client.text_generation(prompt, max_new_tokens=512)
        # Keep only the text after the last assistant tag, if the model echoed it.
        answer = response.strip().split("<|assistant|>")[-1].strip()
    except Exception as e:
        answer = f"Error: {e}"
    return answer, ""
|
|
|
|
|
|
|
| 334 |
|
| 335 |
# === Gradio UI ===
|
| 336 |
with gr.Blocks() as demo:
|
|
|
|
| 353 |
question = " ".join(sys.argv[1:])
|
| 354 |
print(ask(question))
|
| 355 |
else:
|
| 356 |
+
demo.launch()
|