resberry committed on
Commit
ea1a635
·
verified ·
1 Parent(s): 5fb51ba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -213
app.py CHANGED
@@ -1,11 +1,6 @@
1
  # === Required Libraries ===
2
  from huggingface_hub import InferenceClient
3
- import os
4
- import sys
5
- import re
6
- import json
7
- import requests
8
- import logging
9
  from bs4 import BeautifulSoup
10
  from readability import Document
11
  from duckduckgo_search import DDGS
@@ -13,9 +8,7 @@ from concurrent.futures import ThreadPoolExecutor
13
  import gradio as gr
14
  from datetime import datetime, timedelta
15
  from sentence_transformers import SentenceTransformer
16
- import faiss
17
- import numpy as np
18
- import wikipedia
19
 
20
  # === Configuration ===
21
  HF_TOKEN = os.getenv("HF")
@@ -66,328 +59,206 @@ def current_iso_timestamp():
66
  return datetime.utcnow().isoformat()
67
 
68
  def save_chunks(query, chunks, urls):
69
- chunk_data = []
70
- now = current_iso_timestamp()
71
  for chunk in chunks:
72
  chunk_data.append({
73
- "query": query,
74
- "chunk": chunk,
75
  "embedding": embed(chunk).tolist(),
76
- "sources": urls,
77
- "timestamp": now
78
  })
 
79
  if os.path.exists(CHUNK_STORE):
80
  with open(CHUNK_STORE, "r") as f:
81
- existing = json.load(f)
82
- cutoff = datetime.utcnow() - timedelta(days=MAX_CHUNK_AGE_DAYS)
83
- existing = [c for c in existing if datetime.fromisoformat(c.get("timestamp", "1970-01-01T00:00:00")) > cutoff]
84
- else:
85
- existing = []
86
  existing.extend(chunk_data)
87
- with open(CHUNK_STORE, "w") as f:
88
  json.dump(existing, f, indent=2)
89
 
90
  def is_recent_chunk(ts):
91
  try:
92
- return (datetime.utcnow() - datetime.fromisoformat(ts)) < timedelta(days=MAX_CHUNK_AGE_DAYS)
93
  except:
94
  return False
95
 
96
  def retrieve_context_from_chunks(question, top_k=4):
97
  if not os.path.exists(CHUNK_STORE):
98
  return "", [], 0.0
99
- with open(CHUNK_STORE, "r") as f:
100
- data = json.load(f)
101
- data = [d for d in data if is_recent_chunk(d.get("timestamp", ""))]
102
  if not data:
103
  return "", [], 0.0
104
-
105
  embeddings = np.array([d['embedding'] for d in data]).astype('float32')
106
  dim = embeddings.shape[1]
107
  q_emb = embed(question).reshape(1, -1).astype('float32')
108
  if q_emb.shape[1] != dim:
109
  os.remove(CHUNK_STORE)
110
  return "", [], 0.0
111
-
112
  index = faiss.IndexFlatL2(dim)
113
  index.add(embeddings)
114
  distances, I = index.search(q_emb, top_k)
115
  top_chunks = [data[i]['chunk'] for i in I[0]]
116
  sources = list({src for i in I[0] for src in data[i]['sources']})
117
- similarities = 1 / (distances[0] + 1e-6)
118
- avg_sim = np.mean(similarities)
119
  return "\n\n".join(top_chunks), sources, avg_sim
120
 
121
  def fetch_text(url):
122
  try:
123
  r = requests.get(url, headers=HEADERS, timeout=10)
124
  doc = Document(r.text)
125
- soup = BeautifulSoup(doc.summary(), "html.parser")
126
- text = " ".join(p.get_text() for p in soup.find_all("p")).strip()
127
- return text, url
128
- except Exception as e:
129
  return "", url
130
 
131
  def scrape_and_save(query):
132
- filename = re.sub(r'[^a-zA-Z0-9_-]', '_', query)[:50] + ".json"
133
  filepath = os.path.join(CONTEXT_DIR, filename)
134
  if os.path.exists(filepath):
135
- with open(filepath, "r") as f:
136
- d = json.load(f)
137
  return d["context"], d["sources"]
138
-
139
  with DDGS() as ddgs:
140
  results = list(ddgs.text(query, max_results=MAX_RESULTS))
141
-
142
  urls = list({r['href'] for r in results if 'href' in r})
143
- with ThreadPoolExecutor(max_workers=MAX_RESULTS) as executor:
144
- fetched = list(executor.map(fetch_text, urls))
145
-
146
  texts, used_urls, total_chars = [], [], 0
147
  q_emb = embed(query)
148
  for text, url in fetched:
149
  if not text:
150
  continue
151
  if query.lower() not in text.lower():
152
- sim = cosine_similarity(q_emb, embed(text))
153
- if sim < 0.3:
154
  continue
155
  if total_chars + len(text) > MAX_CHARS:
156
- text = text[:MAX_CHARS - total_chars]
157
- texts.append(text)
158
- used_urls.append(url)
159
  total_chars += len(text)
160
  if total_chars >= MAX_CHARS:
161
  break
162
-
163
  context = "\n\n".join(texts)
164
- chunks = chunk_text(context)
165
- save_chunks(query, chunks, used_urls)
166
- with open(filepath, "w") as f:
167
- json.dump({"query": query, "context": context, "sources": used_urls}, f, indent=2)
168
  return context, used_urls
169
 
170
  def get_similar_memories(question, top_k=3):
171
  if not os.path.exists(EMBED_FILE):
172
  return []
173
- with open(EMBED_FILE, "r") as f:
174
- data = json.load(f)
175
  if not data:
176
  return []
177
-
178
  embeddings = np.array([m['embedding'] for m in data]).astype('float32')
179
- dim = embeddings.shape[1]
180
  q_emb = embed(question).reshape(1, -1).astype('float32')
181
- if q_emb.shape[1] != dim:
182
  os.remove(EMBED_FILE)
183
  return []
184
-
185
- index = faiss.IndexFlatL2(dim)
186
  index.add(embeddings)
187
  _, I = index.search(q_emb, top_k)
188
  return [data[i] for i in I[0]]
189
 
190
  def save_embedding_to_store(entry):
191
- if os.path.exists(EMBED_FILE):
192
- with open(EMBED_FILE, "r") as f:
193
- data = json.load(f)
194
- else:
195
- data = []
196
  data.append(entry)
197
- with open(EMBED_FILE, "w") as f:
198
- json.dump(data, f, indent=2)
 
 
 
199
 
200
  def answer_from_context(question):
201
  memory = get_similar_memories(question)
202
  memory_prompt = "\n\n".join(f"Q: {m['q']}\nA: {m['a']}" for m in memory)
203
  context, sources, avg_sim = retrieve_context_from_chunks(question)
204
-
205
- prompt = f"""
206
- Today's date is {datetime.utcnow().date()}.
207
- Use context and memory to answer and summarize the following question using fullly finished lines end with., clear, and grammatically correct finish sentences. Ensure that the response is factually accurate, complete, well-organized, finish sentences and easy to understand. Avoid repeating information,unfinish sentences and keep the response concise while still being informative.
208
-
209
- [CONTEXT]
210
- {context}
211
-
212
- [MEMORY]
213
- {memory_prompt}
214
-
215
- [QUESTION]
216
- Answer and summarize the following question using fullly finish linesens end with., clear, and grammatically correct finish sentences. Ensure that the response is factually accurate, complete, well-organized, finish sentences and easy to understand. Avoid repeating information, unfinish sentences, and keep the response concise while still being informative.
217
- {question}
218
-
219
- [ANSWER]
220
- """
221
- try:
222
- response = client.text_generation(prompt, max_new_tokens=512)
223
- reply = response.strip().split("<|assistant|>")[-1].strip()
224
- except Exception as e:
225
- reply = f"Error: {e}"
226
-
227
- log = {
228
- "time": str(datetime.utcnow()),
229
- "q": question,
230
- "a": reply,
231
- "sources": sources,
232
- "embedding": embed(question).tolist()
233
- }
234
- with open(LOG_FILE, "a") as f:
235
- f.write(json.dumps(log) + "\n")
236
  save_embedding_to_store(log)
237
  return reply, sources, avg_sim
238
 
239
  def needs_web_search_llm(question):
240
- prompt = f"""
241
- You are a helpful assistant that classifies whether a question requires a web search or external data.
242
-
243
- Question: "{question}"
244
-
245
- Answer with only "YES" if a web search is needed or "NO" if not.
246
- """
247
- try:
248
- response = client.text_generation(prompt, max_new_tokens=10)
249
- return "YES" in response.strip().upper()
250
- except Exception as e:
251
- return False
252
 
253
  def is_general_knowledge_question(question):
254
- prompt = f"""
255
- You are a classifier. Determine if the question below can be answered using general world knowledge, like an encyclopedia or Wikipedia.
256
-
257
- Question: "{question}"
258
-
259
- Answer with "YES" if it is general knowledge. Otherwise answer "NO".
260
- """
261
- try:
262
- response = client.text_generation(prompt, max_new_tokens=10)
263
- return "YES" in response.strip().upper()
264
- except Exception as e:
265
- return False
266
 
267
- def get_wikipedia_summary(query, sentences=3):
268
- try:
269
- wikipedia.set_lang("en")
270
- return wikipedia.summary(query, sentences=sentences)
271
- except wikipedia.exceptions.DisambiguationError as e:
272
- return f"Ambiguous question. Possible topics: {', '.join(e.options[:5])}"
273
- except wikipedia.exceptions.PageError:
274
- return "No Wikipedia article found for that topic."
275
- except Exception as e:
276
- return "Error accessing Wikipedia."
277
-
278
- # === Semantic Scholar API integration ===
279
  def semantic_scholar_search(query, max_results=5):
280
- params = {
281
- "query": query,
282
- "fields": SEMANTIC_SCHOLAR_FIELDS,
283
- "limit": max_results
284
- }
285
  try:
286
- resp = requests.get(SEMANTIC_SCHOLAR_API, params=params, timeout=10)
287
- resp.raise_for_status()
288
- data = resp.json()
289
- papers = data.get("data", [])
290
- texts = []
291
- urls = []
292
- for p in papers:
293
- title = p.get("title", "")
294
- abstract = p.get("abstract", "")
295
- url = p.get("url", "")
296
- year = p.get("year", "")
297
- authors = ", ".join([a.get("name","") for a in p.get("authors", [])])
298
- entry = f"Title: {title}\nAuthors: {authors}\nYear: {year}\nAbstract: {abstract}\nURL: {url}\n"
299
- texts.append(entry)
300
- if url:
301
- urls.append(url)
302
  if len("\n\n".join(texts)) > MAX_CHARS:
303
  break
304
- context = "\n\n".join(texts)
305
- chunks = chunk_text(context)
306
- save_chunks(query, chunks, urls)
307
- return context, urls
308
- except Exception as e:
309
- logging.warning(f"Semantic Scholar API error: {e}")
310
  return "", []
311
 
312
  def is_research_question(question):
313
- # Simple heuristic to detect research/scientific questions
314
  keywords = [
315
- "research", "study", "paper", "findings", "experiment", "scientific", "evidence", "meta-analysis",
316
- "hypothesis", "literature review", "case study", "theory", "framework", "methodology", "analysis",
317
- "data", "observation", "results", "variables", "survey", "questionnaire", "sampling", "experiment design",
318
- "quantitative", "qualitative", "mixed methods", "statistical", "inference", "regression", "correlation",
319
- "interview", "focus group", "coding", "themes", "interpretation", "reliability", "validity", "bias",
320
- "significance", "conclusion", "discussion", "implications", "limitations", "future research", "peer review",
321
- "publication", "citation", "replication", "protocol", "ethics", "IRB", "research question", "objective",
322
- "aim", "problem statement", "gap", "contribution", "novelty", "originality", "dataset", "case", "fieldwork",
323
- "observational", "experimental", "review", "systematic review", "control group", "randomized", "longitudinal",
324
- "cross-sectional", "data analysis", "research design", "conceptual", "empirical", "exploratory", "descriptive",
325
- "causal", "predictive", "construct", "operationalization", "dependent variable", "independent variable",
326
- "mediator", "moderator", "association", "impact", "effect", "relationship", "outcome", "measure", "coding scheme"
327
  ]
328
 
329
- q_lower = question.lower()
330
- return any(kw in q_lower for kw in keywords)
331
 
332
  def ask(q):
333
- # Check if research/scientific question and use Semantic Scholar
334
  if is_research_question(q):
335
  context, sources = semantic_scholar_search(q)
336
  if context:
337
- answer, sources, _ = answer_from_context(q)
338
- sources_text = "\n".join(f"- {url}" for url in sources)
339
- return answer, sources_text
340
- # fallback to regular web search if semantic scholar fails
341
- context, sources = scrape_and_save(q)
342
- answer, sources, _ = answer_from_context(q)
343
- sources_text = "\n".join(f"- {url}" for url in sources)
344
- return answer, sources_text
345
-
346
- # General knowledge questions use Wikipedia
347
  if is_general_knowledge_question(q):
348
- return get_wikipedia_summary(q), "Source: Wikipedia"
349
-
350
- # Check if we already have context stored with sufficient similarity
351
  _, _, avg_sim = retrieve_context_from_chunks(q)
352
-
353
- # Check if web search is needed or context similarity too low
354
- intent_search = needs_web_search_llm(q)
355
-
356
- if intent_search or avg_sim < MIN_CONTEXT_SIMILARITY:
357
- context, sources = scrape_and_save(q)
358
- answer, sources, _ = answer_from_context(q)
359
- sources_text = "\n".join(f"- {url}" for url in sources)
360
- else:
361
- # Use model to answer from prompt only
362
- prompt = f"<|user|>\n Answer and summarize the following question using fullly finish lines end with. , clear, and grammatically correct finish sentences. Ensure that the response is factually accurate, complete, well-organized, finish stances, and easy to understand. Avoid repeating information, unfinish sentences, and keep the response concise while still being informative.:\n{q.strip()}\n<|assistant|>\n"
363
- try:
364
- response = client.text_generation(prompt, max_new_tokens=512)
365
- answer = response.strip().split("<|assistant|>")[-1].strip()
366
- except Exception as e:
367
- answer = f"Error: {e}"
368
- sources_text = ""
369
-
370
- return answer, sources_text
371
 
372
  # === Gradio UI ===
373
  with gr.Blocks() as demo:
374
  gr.Markdown("""
375
  ## 🤖 LLaMA 3.1 Smart QA Bot
376
  - Uses **Wikipedia** for general knowledge
377
- - Searches **Semantic Scholar** for research-related questions
378
  - Falls back to web search when needed
379
- - Can handle **casual chat** too!
380
  """)
381
-
382
  q_input = gr.Textbox(label="Your Question")
383
  submit = gr.Button("Ask")
384
  a_output = gr.Textbox(label="Answer")
385
  s_output = gr.Markdown()
386
  submit.click(ask, inputs=q_input, outputs=[a_output, s_output])
387
 
388
- if __name__ == '__main__':
389
  if len(sys.argv) > 1:
390
  question = " ".join(sys.argv[1:])
391
  print(ask(question))
392
  else:
393
- demo.launch()
 
1
  # === Required Libraries ===
2
  from huggingface_hub import InferenceClient
3
+ import os, sys, re, json, requests, logging
 
 
 
 
 
4
  from bs4 import BeautifulSoup
5
  from readability import Document
6
  from duckduckgo_search import DDGS
 
8
  import gradio as gr
9
  from datetime import datetime, timedelta
10
  from sentence_transformers import SentenceTransformer
11
+ import faiss, numpy as np, wikipedia
 
 
12
 
13
  # === Configuration ===
14
  HF_TOKEN = os.getenv("HF")
 
59
  return datetime.utcnow().isoformat()
60
 
61
def save_chunks(query, chunks, urls):
    """Embed *chunks* for *query* and append them to the on-disk chunk store.

    Each record stores the query, the chunk text, its embedding, the source
    URLs, and a UTC timestamp.  Records older than MAX_CHUNK_AGE_DAYS are
    pruned from the store on every save.
    """
    now = current_iso_timestamp()
    chunk_data = [
        {
            "query": query,
            "chunk": chunk,
            "embedding": embed(chunk).tolist(),
            "sources": urls,
            "timestamp": now,
        }
        for chunk in chunks
    ]

    existing = []
    if os.path.exists(CHUNK_STORE):
        # Hoist the cutoff out of the comprehension: the original recomputed
        # `datetime.utcnow() - timedelta(...)` once per stored record.
        cutoff = datetime.utcnow() - timedelta(days=MAX_CHUNK_AGE_DAYS)
        with open(CHUNK_STORE, "r") as f:
            existing = [
                c for c in json.load(f)
                if datetime.fromisoformat(c.get("timestamp", "1970-01-01T00:00:00")) > cutoff
            ]

    existing.extend(chunk_data)
    with open(CHUNK_STORE, "w") as f:
        json.dump(existing, f, indent=2)
78
 
79
def is_recent_chunk(ts):
    """Return True if ISO-8601 timestamp *ts* is newer than MAX_CHUNK_AGE_DAYS.

    Malformed or missing timestamps are treated as stale (False).
    """
    try:
        return datetime.utcnow() - datetime.fromisoformat(ts) < timedelta(days=MAX_CHUNK_AGE_DAYS)
    except (TypeError, ValueError):
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
        # only timestamp-parsing failures are expected here.
        return False
84
 
85
def retrieve_context_from_chunks(question, top_k=4):
    """Return (context, sources, avg_similarity) for *question*.

    Searches the persisted chunk store with a FAISS L2 index over the stored
    embeddings.  Returns empty results when the store is missing, empty, or
    was written with an incompatible embedding dimension (the stale store is
    then deleted).
    """
    if not os.path.exists(CHUNK_STORE):
        return "", [], 0.0
    with open(CHUNK_STORE, "r") as f:
        data = [d for d in json.load(f) if is_recent_chunk(d.get("timestamp", ""))]
    if not data:
        return "", [], 0.0

    embeddings = np.array([d['embedding'] for d in data]).astype('float32')
    dim = embeddings.shape[1]
    q_emb = embed(question).reshape(1, -1).astype('float32')
    if q_emb.shape[1] != dim:
        # Embedding model changed; the cached vectors are unusable.
        os.remove(CHUNK_STORE)
        return "", [], 0.0

    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    distances, I = index.search(q_emb, top_k)
    # FAISS pads results with -1 when the store holds fewer than top_k
    # vectors; data[-1] would silently return the *last* chunk, so keep only
    # real hits (and their matching distances).
    hits = [i for i in I[0] if i >= 0]
    if not hits:
        return "", [], 0.0
    top_chunks = [data[i]['chunk'] for i in hits]
    sources = list({src for i in hits for src in data[i]['sources']})
    # Crude inverse-distance similarity: larger = closer.
    valid_dists = np.array([d for d, i in zip(distances[0], I[0]) if i >= 0])
    avg_sim = float(np.mean(1 / (valid_dists + 1e-6)))
    return "\n\n".join(top_chunks), sources, avg_sim
105
 
106
def fetch_text(url):
    """Fetch *url* and return (main-article text, url).

    Readability extracts the main document; the text of every <p> tag is then
    concatenated.  On any network/parse failure the text is "" so callers can
    simply skip the entry.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
        doc = Document(r.text)
        soup = BeautifulSoup(doc.summary(), "html.parser")
        text = " ".join(p.get_text() for p in soup.find_all("p"))
        return text.strip(), url
    except Exception:
        # A bare `except:` would also trap KeyboardInterrupt; Exception is
        # broad enough for request/parsing failures.
        return "", url
114
 
115
def scrape_and_save(query):
    """Web-search *query*, scrape the hits, persist and return (context, sources).

    Results are cached per-query under CONTEXT_DIR; a cache hit short-circuits
    the search.  Scraped text is kept only when it mentions the query or is
    semantically similar enough, and total context is capped at MAX_CHARS.
    """
    filename = re.sub(r'[^a-zA-Z0-9_-]', '_', query)[:50] + ".json"
    filepath = os.path.join(CONTEXT_DIR, filename)
    if os.path.exists(filepath):
        with open(filepath, "r") as f:
            d = json.load(f)
        return d["context"], d["sources"]

    with DDGS() as ddgs:
        results = list(ddgs.text(query, max_results=MAX_RESULTS))
    urls = list({r['href'] for r in results if 'href' in r})

    # Context-manage the pool: the bare `ThreadPoolExecutor(...).map(...)`
    # form never shuts the executor down and leaks threads on every call.
    with ThreadPoolExecutor(max_workers=MAX_RESULTS) as executor:
        fetched = list(executor.map(fetch_text, urls))

    texts, used_urls, total_chars = [], [], 0
    q_emb = embed(query)
    for text, url in fetched:
        if not text:
            continue
        if query.lower() not in text.lower():
            # Keep tangential pages only when semantically close enough.
            if cosine_similarity(q_emb, embed(text)) < 0.3:
                continue
        if total_chars + len(text) > MAX_CHARS:
            text = text[:MAX_CHARS - total_chars]
        texts.append(text)
        used_urls.append(url)
        total_chars += len(text)
        if total_chars >= MAX_CHARS:
            break

    context = "\n\n".join(texts)
    save_chunks(query, chunk_text(context), used_urls)
    # `open(...).write(...)` leaks the handle; use a context manager.
    with open(filepath, "w") as f:
        json.dump({"query": query, "context": context, "sources": used_urls}, f, indent=2)
    return context, used_urls
143
 
144
def get_similar_memories(question, top_k=3):
    """Return up to *top_k* stored Q/A records most similar to *question*.

    Uses a FAISS L2 index over the embeddings persisted in EMBED_FILE.
    An empty list is returned when the store is missing or empty, or when the
    stored embedding dimension no longer matches (the stale store is removed).
    """
    if not os.path.exists(EMBED_FILE):
        return []
    # Context-manage the file: `json.load(open(...))` leaks the handle.
    with open(EMBED_FILE, "r") as f:
        data = json.load(f)
    if not data:
        return []

    embeddings = np.array([m['embedding'] for m in data]).astype('float32')
    q_emb = embed(question).reshape(1, -1).astype('float32')
    if q_emb.shape[1] != embeddings.shape[1]:
        # Embedding model changed; cached vectors are unusable.
        os.remove(EMBED_FILE)
        return []

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    _, I = index.search(q_emb, top_k)
    # FAISS pads with -1 when the store has fewer than top_k entries;
    # data[-1] would duplicate the last record, so drop the padding.
    return [data[i] for i in I[0] if i >= 0]
159
 
160
def save_embedding_to_store(entry):
    """Append one Q/A log *entry* (a dict carrying an 'embedding') to EMBED_FILE.

    The whole store is read, extended, and rewritten.  Bare `open()` calls are
    replaced with context managers so handles are closed deterministically.
    """
    if os.path.exists(EMBED_FILE):
        with open(EMBED_FILE, "r") as f:
            data = json.load(f)
    else:
        data = []
    data.append(entry)
    with open(EMBED_FILE, "w") as f:
        json.dump(data, f, indent=2)
164
+
165
def call_conversational(messages, max_new_tokens):
    """Send chat *messages* to the HF inference client; return the last reply text.

    NOTE(review): `InferenceClient.conversational` was deprecated and later
    removed from huggingface_hub in favour of `chat_completion` — confirm the
    pinned huggingface_hub version still exposes it.
    Assumes the response is a sequence of message dicts whose last element is
    the assistant reply with a "content" key — TODO confirm against the
    installed client version.
    """
    resp = client.conversational(messages=messages, max_new_tokens=max_new_tokens)
    return resp[-1]["content"].strip()
168
 
169
def answer_from_context(question):
    """Answer *question* from stored chunks plus remembered Q/A pairs.

    Returns (reply, sources, avg_similarity).  The exchange is appended to
    LOG_FILE (JSON lines) and to the embedding store for future recall.
    """
    memory = get_similar_memories(question)
    memory_prompt = "\n\n".join(f"Q: {m['q']}\nA: {m['a']}" for m in memory)
    context, sources, avg_sim = retrieve_context_from_chunks(question)
    prompt = f"Today's date is {datetime.utcnow().date()}.\n\nContext:\n{context}\n\nMemory:\n{memory_prompt}\n\nQuestion:\n{question}\n\nAnswer concisely, clearly, and grammatically:"
    try:
        reply = call_conversational([{"role": "user", "content": prompt}], max_new_tokens=512)
    except Exception as e:
        # Contain inference failures instead of raising into the Gradio
        # handler (the pre-refactor version did the same).
        reply = f"Error: {e}"
    log = {
        "time": str(datetime.utcnow()),
        "q": question,
        "a": reply,
        "sources": sources,
        "embedding": embed(question).tolist(),
    }
    # `open(...).write(...)` leaks the handle and may delay the flush of the
    # log line; use a context manager.
    with open(LOG_FILE, "a") as f:
        f.write(json.dumps(log) + "\n")
    save_embedding_to_store(log)
    return reply, sources, avg_sim
179
 
180
def needs_web_search_llm(question):
    """Ask the LLM whether *question* needs a fresh web search.

    Returns True only when the model answers YES.  Inference failures return
    False (fail-safe) so ask() can still answer from local context — the
    refactor had dropped the try/except the original version carried.
    """
    prompt = f'Does this need a web search? "{question}" Answer only YES or NO.'
    try:
        resp = call_conversational([{"role": "user", "content": prompt}], max_new_tokens=10)
    except Exception:
        return False
    return "YES" in resp.upper()
 
 
 
 
 
 
 
 
 
184
 
185
def is_general_knowledge_question(question):
    """Ask the LLM whether *question* is answerable from general knowledge.

    Returns False (fail-safe) when the inference call itself fails, so the
    caller falls through to the retrieval/web-search path instead of crashing.
    """
    prompt = f'Can this be answered with general knowledge (e.g., encyclopedia)? "{question}" YES or NO.'
    try:
        resp = call_conversational([{"role": "user", "content": prompt}], max_new_tokens=10)
    except Exception:
        return False
    return "YES" in resp.upper()
 
 
 
 
 
 
 
 
 
189
 
 
 
 
 
 
 
 
 
 
 
 
 
190
def semantic_scholar_search(query, max_results=5):
    """Query the Semantic Scholar API and return (context, urls) for *query*.

    Each paper becomes one text entry (title/authors/year/abstract/URL); the
    combined context is chunked and persisted via save_chunks().  On any API
    failure ("", []) is returned so the caller can fall back to web search.
    """
    params = {"query": query, "fields": SEMANTIC_SCHOLAR_FIELDS, "limit": max_results}
    try:
        resp = requests.get(SEMANTIC_SCHOLAR_API, params=params, timeout=10)
        resp.raise_for_status()
        texts, urls, total = [], [], 0
        for p in resp.json().get("data", []):
            # a.get("name", "") instead of a["name"]: one author record
            # missing a name would raise KeyError and abort the whole search
            # via the except below.
            authors = ", ".join(a.get("name", "") for a in p.get("authors", []))
            entry = (
                f"Title: {p.get('title', '')}\n"
                f"Authors: {authors}\n"
                f"Year: {p.get('year', '')}\n"
                f"Abstract: {p.get('abstract', '')}\n"
                f"URL: {p.get('url', '')}"
            )
            url = p.get('url', '')
            if url:
                # Only real URLs; the refactor appended "" for URL-less papers.
                urls.append(url)
            # Track the running length instead of re-joining all entries on
            # every iteration (which was O(n^2)).
            if texts:
                total += 2  # "\n\n" separator
            texts.append(entry)
            total += len(entry)
            if total > MAX_CHARS:
                break
        context = "\n\n".join(texts)
        save_chunks(query, chunk_text(context), urls)
        return context, urls
    except Exception as e:
        # Keep the exception detail (lazy %-formatting) so failures are
        # diagnosable; the refactor logged a bare, detail-free message.
        logging.warning("Semantic Scholar API error: %s", e)
        return "", []
205
 
206
# Lowercased research-indicator terms, hoisted to module level so the list is
# not rebuilt on every call.  Note "irb" is lowercase on purpose: the match
# runs against question.lower(), so the original uppercase "IRB" keyword could
# never fire.
_RESEARCH_KEYWORDS = (
    "research", "study", "paper", "findings", "experiment", "scientific", "evidence", "meta-analysis",
    "hypothesis", "literature review", "case study", "theory", "framework", "methodology", "analysis",
    "data", "observation", "results", "variables", "survey", "questionnaire", "sampling", "experiment design",
    "quantitative", "qualitative", "mixed methods", "statistical", "inference", "regression", "correlation",
    "interview", "focus group", "coding", "themes", "interpretation", "reliability", "validity", "bias",
    "significance", "conclusion", "discussion", "implications", "limitations", "future research", "peer review",
    "publication", "citation", "replication", "protocol", "ethics", "irb", "research question", "objective",
    "aim", "problem statement", "gap", "contribution", "novelty", "originality", "dataset", "case", "fieldwork",
    "observational", "experimental", "review", "systematic review", "control group", "randomized", "longitudinal",
    "cross-sectional", "data analysis", "research design", "conceptual", "empirical", "exploratory", "descriptive",
    "causal", "predictive", "construct", "operationalization", "dependent variable", "independent variable",
    "mediator", "moderator", "association", "impact", "effect", "relationship", "outcome", "measure", "coding scheme",
)


def is_research_question(question):
    """Heuristically detect research/scientific questions by keyword match.

    Substring matching is deliberately loose (e.g. "data", "case", "impact"
    hit many everyday questions); tune _RESEARCH_KEYWORDS if that proves too
    aggressive.
    """
    q_lower = question.lower()
    return any(kw in q_lower for kw in _RESEARCH_KEYWORDS)
 
223
 
224
def ask(q):
    """Route question *q* to the best answer source.

    Order: Semantic Scholar (research questions) -> Wikipedia (general
    knowledge) -> stored context / web scrape -> direct LLM answer.
    Returns (answer, markdown-formatted sources).
    """
    def _answer_with_sources():
        # Shared tail for the retrieval paths: answer from stored chunks and
        # format the source URLs as a markdown bullet list.
        answer, sources_used, _ = answer_from_context(q)
        return answer, "\n".join(f"- {u}" for u in sources_used)

    if is_research_question(q):
        context, sources = semantic_scholar_search(q)
        if not context:
            # Semantic Scholar failed or returned nothing: fall back to the
            # regular web scrape so the chunk store still gets material.
            context, sources = scrape_and_save(q)
        return _answer_with_sources()

    if is_general_knowledge_question(q):
        # wikipedia.summary raises on ambiguous or missing topics; the
        # unguarded call crashed the Gradio handler (the pre-refactor helper
        # handled these cases).
        try:
            return wikipedia.summary(q, sentences=3), "Source: Wikipedia"
        except wikipedia.exceptions.DisambiguationError as e:
            return f"Ambiguous question. Possible topics: {', '.join(e.options[:5])}", "Source: Wikipedia"
        except Exception:
            pass  # no article / API error: fall through to retrieval below

    _, _, avg_sim = retrieve_context_from_chunks(q)
    if needs_web_search_llm(q) or avg_sim < MIN_CONTEXT_SIMILARITY:
        scrape_and_save(q)  # refresh the chunk store before answering
        return _answer_with_sources()

    # Sufficiently similar context already cached: answer directly.
    answer = call_conversational([{"role": "user", "content": q.strip()}], max_new_tokens=512)
    return answer, ""
 
 
 
 
 
 
 
 
 
 
 
 
243
 
244
  # === Gradio UI ===
245
# === Gradio UI ===
with gr.Blocks() as demo:
    gr.Markdown("""
    ## 🤖 LLaMA 3.1 Smart QA Bot
    - Uses **Wikipedia** for general knowledge
    - Searches **Semantic Scholar** for research
    - Falls back to web search when needed
    - Supports casual chat!
    """)
    q_input = gr.Textbox(label="Your Question")
    submit = gr.Button("Ask")
    a_output = gr.Textbox(label="Answer")
    s_output = gr.Markdown()
    # ask() returns (answer, sources_markdown), matching the two outputs.
    submit.click(ask, inputs=q_input, outputs=[a_output, s_output])

if __name__ == "__main__":
    if len(sys.argv) > 1:
        # CLI mode: `python app.py <question words...>`.
        question = " ".join(sys.argv[1:])
        # ask() returns a tuple; unpack it instead of printing the raw
        # `(answer, sources)` repr as the previous code did.
        answer, sources = ask(question)
        print(answer)
        if sources:
            print(sources)
    else:
        demo.launch()