selim-ba committed on
Commit
5cae2c0
·
verified ·
1 Parent(s): 2abd52d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +455 -119
app.py CHANGED
@@ -35,6 +35,7 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
35
  # --- Constants ---
36
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
37
 
 
38
  class SuperSmartAgent:
39
  def __init__(self):
40
  self.graph = self._build_graph()
@@ -122,14 +123,17 @@ class SuperSmartAgent:
122
 
123
  def check_wikipedia_suitability(state):
124
  q = state["question"].lower()
125
- triggers = ["wikipedia","Wikipedia","who is", "what is", "when did", "where is", "tell me about", "how many"]
 
 
 
 
126
  state["is_wiki"] = any(trigger in q for trigger in triggers)
127
  return state
128
 
129
  def search_wikipedia(state):
130
  question = state["question"]
131
  try:
132
- # Use wikipedia library's search instead of wikipediaapi
133
  page_titles = wikipedia.search(question)
134
  if not page_titles:
135
  state["response"] = "No relevant Wikipedia article found."
@@ -141,8 +145,59 @@ class SuperSmartAgent:
141
  state["response"] = f"Error fetching Wikipedia content: {e}"
142
  return state
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  def preprocess_context(context):
145
- context = re.sub(r'\[\d+\]', '', context)
146
  context = re.sub(r'\s+', ' ', context).strip()
147
  context = re.sub(r'\{\|.*?\|\}', '', context, flags=re.DOTALL)
148
  return context
@@ -162,105 +217,424 @@ class SuperSmartAgent:
162
 
163
  def general_reasoning_qa(state):
164
  question = state["question"]
165
- # Step 1: Search Wikipedia for relevant pages
166
  try:
167
- # Use wikipedia library for search functionality
168
  search_results = wikipedia.search(question, results=3)
169
- context = ""
170
-
171
- # Use wikipediaapi to get full content for each result
172
- for title in search_results:
173
- try:
174
- page = self.wiki_wiki.page(title)
175
- if page.exists():
176
- context += f"\n\n=== Content from: {title} ===\n\n"
177
- context += page.text
178
- except Exception as e:
179
- print(f"Error processing page {title}: {e}")
180
- continue
181
 
 
182
  if not context:
183
  state["response"] = "Sorry, I couldn't find relevant information."
184
  return state
185
 
186
  # Preprocess the context
187
- context = preprocess_context(context)
188
-
189
- # Step 2: Extract key phrases from the question
190
- key_phrases = extract_key_phrases(question)
191
 
192
- # Step 3: Find relevant sections in the context
193
- relevant_sections = []
194
- sections = re.split(r'\n\s*\n', context)
195
-
196
- for section in sections:
197
- if any(phrase.lower() in section.lower() for phrase in key_phrases):
198
- relevant_sections.append(section)
199
-
200
- if not relevant_sections:
201
- state["response"] = "I found information but couldn't identify the most relevant parts."
202
- return state
203
 
204
- # Combine relevant sections
205
- relevant_context = "\n\n".join(relevant_sections)
206
 
207
- # Step 4: Simple answer extraction based on patterns
208
- answer = self.extract_answer(question, relevant_context)
209
  if answer:
210
  state["response"] = answer
211
  else:
212
  try:
213
- if search_results:
214
- first_page = self.wiki_wiki.page(search_results[0])
215
- if first_page.exists():
216
- summary = first_page.summary[:500] + "..." # Limit summary length
217
- state["response"] = f"I couldn't find a specific answer, but here's some relevant information: {summary}"
218
- else:
219
- state["response"] = "No relevant information found."
220
  except Exception as e:
221
  state["response"] = f"I couldn't find a specific answer in the available information."
222
  except Exception as e:
223
  state["response"] = f"An error occurred while searching for information: {str(e)}"
224
  return state
225
 
226
- def extract_answer(question, context):
227
- """Simple heuristic-based answer extraction"""
228
- if re.search(r'\bhow many\b', question.lower()):
229
- numbers = re.findall(r'\d+', context)
230
- if numbers:
231
- return f"The answer is {numbers[0]}."
232
- elif re.search(r'\bwhen (did|was|were)\b', question.lower()):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  years = re.findall(r'\b(19|20)\d{2}\b', context)
234
- if years:
235
- return f"The answer is {years[0]}."
236
- elif re.search(r'\bwho (is|was)\b', question.lower()):
237
- names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', context)
238
- if names:
239
- return f"The answer is {names[0]}."
240
- elif re.search(r'\bwhat (is|are|was|were)\b', question.lower()):
241
- first_sentence = re.search(r'^[^.!?]*[.!?]', context)
242
- if first_sentence:
243
- return first_sentence.group(0)
244
- elif re.search(r'\blist of\b', question.lower()):
245
- items = re.findall(r'^\s*[•*-]\s*.*', context, re.MULTILINE)
246
- if items:
247
- return "Some relevant items: " + ", ".join([item.strip()[2:] for item in items[:3]]) + "..."
248
- key_phrases = extract_key_phrases(question)
249
- if key_phrases:
 
250
  sentences = re.split(r'[.!?]', context)
 
 
251
  for sentence in sentences:
252
- if any(phrase.lower() in sentence.lower() for phrase in key_phrases):
253
- return sentence.strip() + "."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  return None
255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  class AgentState(TypedDict, total=False):
257
  question: str
258
  is_reversed: bool
259
  is_python: bool
260
  is_riddle: bool
261
- use_tool: str
 
262
  response: str
263
-
 
264
  builder = StateGraph(AgentState)
265
  # --- Nodes ---
266
  builder.add_node("check_reversed", check_reversed)
@@ -274,87 +648,49 @@ class SuperSmartAgent:
274
  builder.add_node("check_python_suitability", check_python_suitability)
275
  builder.add_node("generate_code", generate_code)
276
  builder.add_node("fallback", fallback)
277
-
278
- # Entry
279
  builder.set_entry_point("check_reversed")
280
- # Edges
 
281
  builder.add_edge("check_reversed", "fix_question")
282
  builder.add_edge("fix_question", "check_riddle_or_trick")
283
-
284
  builder.add_conditional_edges(
285
  "check_riddle_or_trick",
286
  lambda s: "solve_riddle" if s.get("is_riddle") else "check_wikipedia_suitability"
287
  )
288
-
289
  builder.add_conditional_edges(
290
  "check_wikipedia_suitability",
291
  lambda s: "search_wikipedia" if s.get("is_wiki") else "check_reasoning_needed"
292
  )
293
-
294
  builder.add_conditional_edges(
295
  "check_reasoning_needed",
296
  lambda s: "general_reasoning_qa" if s.get("needs_reasoning") else "check_python_suitability"
297
  )
298
-
299
  builder.add_conditional_edges(
300
  "check_python_suitability",
301
  lambda s: "generate_code" if s.get("is_python") else "fallback"
302
  )
303
-
304
- # Ends
305
  builder.add_edge("solve_riddle", END)
306
  builder.add_edge("search_wikipedia", END)
307
  builder.add_edge("general_reasoning_qa", END)
308
  builder.add_edge("generate_code", END)
309
  builder.add_edge("fallback", END)
310
-
311
  graph = builder.compile()
312
  return graph
313
-
314
- def extract_answer(self, question, context):
315
- """Simple heuristic-based answer extraction"""
316
- # If question asks for a count (e.g., "how many")
317
- if re.search(r'\bhow many\b', question.lower()):
318
- numbers = re.findall(r'\d+', context)
319
- if numbers:
320
- return f"The answer is {numbers[0]}."
321
- # If question asks for a date/year (e.g., "when did")
322
- elif re.search(r'\bwhen (did|was|were)\b', question.lower()):
323
- years = re.findall(r'\b(19|20)\d{2}\b', context)
324
- if years:
325
- return f"The answer is {years[0]}."
326
- # If question asks for a name/person (e.g., "who is")
327
- elif re.search(r'\bwho (is|was)\b', question.lower()):
328
- names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', context)
329
- if names:
330
- return f"The answer is {names[0]}."
331
- # If question asks for a definition/explanation (e.g., "what is")
332
- elif re.search(r'\bwhat (is|are|was|were)\b', question.lower()):
333
- first_sentence = re.search(r'^[^.!?]*[.!?]', context)
334
- if first_sentence:
335
- return first_sentence.group(0)
336
- # If question asks for a list (e.g., "list of")
337
- elif re.search(r'\blist of\b', question.lower()):
338
- items = re.findall(r'^\s*[•*-]\s*.*', context, re.MULTILINE)
339
- if items:
340
- return "Some relevant items: " + ", ".join([item.strip()[2:] for item in items[:3]]) + "..."
341
- # Default case - return a relevant sentence containing question keywords
342
- key_phrases = extract_key_phrases(question)
343
- if key_phrases:
344
- sentences = re.split(r'[.!?]', context)
345
- for sentence in sentences:
346
- if any(phrase.lower() in sentence.lower() for phrase in key_phrases):
347
- return sentence.strip() + "."
348
- return None
349
-
350
-
351
-
352
  def __call__(self, question: str) -> str:
353
  state = {"question": question}
354
  result = self.graph.invoke(state)
355
  return result.get("response", "No answer generated.")
356
 
357
 
 
 
 
358
 
359
  ########################################
360
  def run_and_submit_all( profile: gr.OAuthProfile | None):
 
35
  # --- Constants ---
36
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
37
 
38
+
39
  class SuperSmartAgent:
40
  def __init__(self):
41
  self.graph = self._build_graph()
 
123
 
124
  def check_wikipedia_suitability(state):
125
  q = state["question"].lower()
126
+ triggers = [
127
+ "wikipedia", "who is", "what is", "when did", "where is",
128
+ "tell me about", "how many", "how much", "what was the",
129
+ "describe", "explain", "information about", "details about"
130
+ ]
131
  state["is_wiki"] = any(trigger in q for trigger in triggers)
132
  return state
133
 
134
  def search_wikipedia(state):
135
  question = state["question"]
136
  try:
 
137
  page_titles = wikipedia.search(question)
138
  if not page_titles:
139
  state["response"] = "No relevant Wikipedia article found."
 
145
  state["response"] = f"Error fetching Wikipedia content: {e}"
146
  return state
147
 
148
def get_relevant_context(self, question, search_results):
    """
    Build a context string from the first search result's Wikipedia page.

    The page text is split on blank lines. Sections that mention any key
    phrase of the question are kept; those that also look statistical are
    placed ahead of the others (most recently found first). When no section
    matches, the first 10000 characters of the page are returned instead.

    Returns "" when there are no search results, the page does not exist,
    or any error occurs while processing the page.
    """
    if not search_results:
        return ""

    try:
        page = self.wiki_wiki.page(search_results[0])
        if not page.exists():
            return ""

        article_text = page.text
        phrases = self.extract_key_phrases(question)

        stat_hits = []
        plain_hits = []
        for chunk in re.split(r'\n\s*\n', article_text):
            chunk_lower = chunk.lower()
            if not any(phrase.lower() in chunk_lower for phrase in phrases):
                continue
            if self.section_contains_statistics(chunk):
                stat_hits.append(chunk)
            else:
                plain_hits.append(chunk)

        # Statistical sections go first, latest match leading, then the rest
        # in document order.
        ordered = stat_hits[::-1] + plain_hits
        if ordered:
            return "\n\n".join(ordered)

        return article_text[:10000]  # Limit context size

    except Exception as e:
        print(f"Error processing page: {e}")
        return ""
188
+
189
def section_contains_statistics(self, section):
    """Determine if a section likely contains statistics.

    Indicator words are matched as whole words (with an optional plural
    "s"). Plain substring matching flagged almost every section, because
    short indicators such as "era" occur inside ordinary words like
    "several", "general" and "literature".

    Args:
        section: Text of one section of a Wikipedia page.

    Returns:
        True when the section contains a wiki-table column marker or one
        of the statistical indicator words, otherwise False.
    """
    section_lower = section.lower()

    # Raw wiki-table column markers are punctuation-led, so a substring
    # check is the right tool for them.
    table_markers = ('| year', '| player', '| ab', '| w')
    if any(marker in section_lower for marker in table_markers):
        return True

    indicator_words = (
        r'\b(?:statistics|stats|season|player|year|at bat|walk|'
        r'home run|rbi|era)s?\b'
    )
    return re.search(indicator_words, section_lower) is not None
198
+
199
  def preprocess_context(context):
200
+ context = re.sub(r'$$\d+$$', '', context)
201
  context = re.sub(r'\s+', ' ', context).strip()
202
  context = re.sub(r'\{\|.*?\|\}', '', context, flags=re.DOTALL)
203
  return context
 
217
 
218
  def general_reasoning_qa(state):
219
  question = state["question"]
220
+
221
  try:
 
222
  search_results = wikipedia.search(question, results=3)
223
+ if not search_results:
224
+ state["response"] = "Sorry, I couldn't find relevant information."
225
+ return state
 
 
 
 
 
 
 
 
 
226
 
227
+ context = self.get_relevant_context(question, search_results)
228
  if not context:
229
  state["response"] = "Sorry, I couldn't find relevant information."
230
  return state
231
 
232
  # Preprocess the context
233
+ context = self.preprocess_context(context)
 
 
 
234
 
235
+ # Extract tables if available
236
+ tables = self.extract_tables_from_wikipedia(context)
 
 
 
 
 
 
 
 
 
237
 
238
+ # Use enhanced answer extraction
239
+ answer = self.extract_answer(question, context, tables)
240
 
 
 
241
  if answer:
242
  state["response"] = answer
243
  else:
244
  try:
245
+ first_page = self.wiki_wiki.page(search_results[0])
246
+ if first_page.exists():
247
+ summary = first_page.summary[:500] + "..."
248
+ state["response"] = f"I couldn't find a specific answer, but here's some relevant information: {summary}"
249
+ else:
250
+ state["response"] = "No relevant information found."
 
251
  except Exception as e:
252
  state["response"] = f"I couldn't find a specific answer in the available information."
253
  except Exception as e:
254
  state["response"] = f"An error occurred while searching for information: {str(e)}"
255
  return state
256
 
257
def extract_tables_from_wikipedia(self, content):
    """
    Extract tables from Wikipedia content.

    Both wiki-markup tables ("{| ... |}") and HTML tables
    ("<table> ... </table>") are recognised.

    Args:
        content: Raw page text, possibly containing table markup.

    Returns:
        A list of tables. Each table is a list of rows and each row is a
        list of cleaned cell strings. Wiki-markup tables come first,
        followed by HTML tables. Returns [] when no table is found.
    """
    tables = []

    # Wiki-markup tables. The flags must be passed to re.findall itself:
    # writing `pattern = r'...', re.DOTALL` builds a tuple and makes
    # re.findall raise TypeError.
    for table_match in re.findall(r'\{\|(.*?)\|\}', content, re.DOTALL):
        clean_rows = []

        for row in re.split(r'\|\-', table_match):
            clean_cells = []

            for cell in re.split(r'\|\|', row):
                # [[Target|Label]] and [[Target]] both become "Target".
                cell = re.sub(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', r'\1', cell)
                cell = re.sub(r'<[^>]+>', '', cell)
                cell = re.sub(r'{{\s*[^{}]+\s*}}', '', cell)
                cell = re.sub(r'\s+', ' ', cell).strip()
                # Drop the leading "|" / "!" cell marker left by the split.
                cell = cell.lstrip('|!').strip()
                clean_cells.append(cell)

            if clean_cells:
                clean_rows.append(clean_cells)

        if clean_rows:
            tables.append(clean_rows)

    # HTML tables
    html_flags = re.DOTALL | re.IGNORECASE
    for table_match in re.findall(r'<table.*?</table>', content, html_flags):
        clean_rows = []

        for row in re.findall(r'<tr.*?</tr>', table_match, html_flags):
            clean_cells = []

            for cell in re.findall(r'<t[dh].*?</t[dh]>', row, html_flags):
                cell = re.sub(r'<.*?>', '', cell)
                cell = re.sub(r'\s+', ' ', cell).strip()
                clean_cells.append(cell)

            if clean_cells:
                clean_rows.append(clean_cells)

        if clean_rows:
            tables.append(clean_rows)

    return tables
312
+
313
def extract_answer(self, question, context, tables=None):
    """
    Enhanced general purpose answer extraction from text context.

    The question is classified with detect_question_type and handled by
    type:

    * "count"  - the best-scoring number in the context, then the tables.
    * "person" - the named entity found closest to the question keywords.
    * "date"   - the first date pattern that matches anywhere in the
      context, returned as the complete matched text.

    When none of these produces an answer, or for any other question type,
    the sentence containing the most question key phrases is returned
    (ties go to the longest sentence, then to the earliest).

    Args:
        question: The user's question.
        context: Cleaned text to search for the answer.
        tables: Optional list of extracted tables (rows of cell strings).

    Returns:
        The answer sentence, or None when nothing relevant is found.
    """
    if tables is None:
        tables = []

    question_lower = question.lower()
    question_type = self.detect_question_type(question_lower)

    if question_type == "count":
        # Collect each number with about 50 characters of text either side.
        number_contexts = []
        for match in re.finditer(r'(\d[\d,]*\d*)', context):
            start_pos = max(0, match.start() - 50)
            end_pos = min(len(context), match.end() + 50)
            number_contexts.append(
                (match.group(1).replace(',', ''), context[start_pos:end_pos])
            )

        best_match = self.find_best_number_match(question_lower, number_contexts)
        if best_match:
            return f"The answer is {best_match[0]}."

        # No usable number in the prose: try the tables.
        if tables:
            table_answer = self.find_answer_in_tables(question, tables)
            if table_answer:
                return table_answer

    elif question_type == "person":
        # Entity extraction is only needed for this question type.
        named_entities = self.extract_named_entities(context)
        if named_entities:
            relevant_name = self.find_relevant_person(
                question_lower, context.lower(), named_entities
            )
            if relevant_name:
                return f"The answer is {relevant_name}."

    elif question_type == "date":
        months = (
            r'(?:January|February|March|April|May|June|July|August|'
            r'September|October|November|December)'
        )
        # Ordered from most to least specific. The month group is
        # non-capturing and group(0) is used, so the WHOLE date is returned
        # (re.findall with a capturing group returned only the month name).
        date_patterns = [
            r'\b\d{1,2}\s+' + months + r'[\s,]\s*\d{4}\b',
            r'\b' + months + r'\s+\d{1,2},?\s+\d{4}\b',
            r'\b\d{1,2}/\d{1,2}/\d{4}\b',
            r'\b\d{1,2}-\d{1,2}-\d{4}\b',
            r'\b\d{4}\b',
        ]
        for pattern in date_patterns:
            found = re.search(pattern, context)
            if found:
                return f"The answer is {found.group(0)}."

    # Fallback for every question type: the most relevant sentence.
    if question_keywords := self.extract_key_phrases(question):
        lowered_keywords = [keyword.lower() for keyword in question_keywords]
        scored_sentences = []

        for sentence in re.split(r'[.!?]', context):
            sentence = sentence.strip()
            if not sentence:
                continue

            sentence_lower = sentence.lower()
            score = sum(1 for keyword in lowered_keywords if keyword in sentence_lower)
            if score > 0:
                scored_sentences.append((score, sentence))

        if scored_sentences:
            # Highest score wins, then the longest sentence; max() keeps the
            # earliest sentence among exact ties.
            _, best_sentence = max(
                scored_sentences, key=lambda item: (item[0], len(item[1]))
            )
            # The split above removed the terminator, so add a full stop.
            return best_sentence + "."

    return None
412
 
413
def detect_question_type(self, question):
    """Classify the type of question for general processing.

    Rules are tried in order and the first regex that matches decides the
    label; "general" is returned when no rule matches. Patterns are
    lower-case, so callers are expected to pass a lower-cased question.
    """
    rules = (
        (r'\bhow many\b|\bhow much\b|\bwhat was the\s+\w+\s+of\b', "count"),
        (r'\bwho is\b|\bwho was\b|\bwhich person\b|\bwhich player\b', "person"),
        (r'\bwhen did\b|\bwhen was\b|\bwhat year\b|\bwhat date\b', "date"),
        (r'\bwhat is\b|\bwhat was\b|\bwhat are\b|\bwhat were\b', "definition"),
        (r'\bwhere is\b|\bwhere was\b|\bwhat location\b', "location"),
        (r'\blist of\b|\blist the\b|\bgive me a list of\b', "list"),
    )
    return next(
        (label for pattern, label in rules if re.search(pattern, question)),
        "general",
    )
429
+
430
def find_best_number_match(self, question, number_contexts):
    """Find the number from context that best matches the question.

    Each candidate is scored by:

    * +1 for every question key phrase present in its snippet;
    * a proximity bonus of up to 10 for every key-phrase occurrence,
      shrinking by 1 per 10 characters of distance from the number;
    * a small constant-like term based on the snippet length.

    Args:
        question: The (lower-cased) question text.
        number_contexts: List of (number, snippet) tuples, where number is a
            digit string with thousands separators already removed and
            snippet is the text surrounding it.

    Returns:
        (number, snippet) for the highest-scoring candidate (the earliest
        one on ties), or None when number_contexts is empty.
    """
    if not number_contexts:
        return None

    keywords = [keyword.lower() for keyword in self.extract_key_phrases(question)]
    scored_numbers = []

    for number, context in number_contexts:
        context_lower = context.lower()

        # Score based on question keyword presence in context
        score = sum(1 for keyword in keywords if keyword in context_lower)

        # Locate the number inside its snippet. The candidate had its commas
        # removed ("1200") while the snippet still reads "1,200", so allow
        # an optional comma between digits; a plain str.find never matched
        # such numbers and they silently lost the proximity bonus.
        digits_pattern = (
            r'(?<!\d)'
            + ',?'.join(re.escape(char) for char in number.lower())
            + r'(?!\d)'
        )
        located = re.search(digits_pattern, context_lower)
        if located:
            number_pos = located.start()
            for keyword in keywords:
                for hit in re.finditer(re.escape(keyword), context_lower):
                    distance = abs(number_pos - hit.start())
                    score += max(0, 10 - distance / 10)  # Closer keywords score higher

        # Snippets are short, so this is close to a constant +1; snippets
        # shortened by the document edges score marginally higher.
        score += (10000 - len(context)) / 10000

        scored_numbers.append((score, number, context))

    # max() keeps the earliest candidate among equal scores, matching a
    # stable descending sort.
    _, best_number, best_context = max(scored_numbers, key=lambda item: item[0])
    return (best_number, best_context)
467
+
468
def extract_named_entities(self, text):
    """Extract named entities (people, places, etc.) from text.

    Two heuristics are combined:

    1. Within each sentence (split on ".", "!" or "?"), runs of capitalised
       words are collected, ignoring a capitalised first word. A run is
       kept when it has at least two words or is longer than 10 characters.
    2. An honorific followed by a capitalised name ("Dr. Jane") is extended
       with the capitalised words that directly follow it.

    Args:
        text: Text to scan.

    Returns:
        A list of unique entity strings in order of first appearance, so
        repeated runs give the same result.
    """
    # A dict serves as an insertion-ordered set; a plain set made the order
    # (and therefore downstream tie-breaking) vary between runs.
    entities = {}

    for sentence in re.split(r'[.!?]', text):
        tokens = re.findall(r'\b\w+\b', sentence)

        # Skip first word if capitalized (likely start of sentence)
        if tokens and tokens[0][0].isupper():
            tokens = tokens[1:]

        # Find sequences of capitalized words (likely proper nouns)
        i = 0
        while i < len(tokens):
            if not tokens[i][0].isupper():
                i += 1
                continue
            start = i
            while i < len(tokens) and tokens[i][0].isupper():
                i += 1
            entity = ' '.join(tokens[start:i])
            if i - start >= 2 or len(entity) > 10:
                entities[entity] = None

    # Look for titles like Dr., Mr., etc. The match is case-sensitive: with
    # re.IGNORECASE the "[A-Z][a-z]+" part also accepted lowercase words, so
    # phrases such as "mr. and" were reported as names.
    title_pattern = r'\b(Dr|Mr|Ms|Mrs|Prof|Sr|Jr|Rev|Gen|Col|Maj|Lt|Sgt|Capt)\.\s+[A-Z][a-z]+'
    following_word = re.compile(r'\s+(\w+)')
    for match in re.finditer(title_pattern, text):
        full_entity = match.group(0)
        # Extend with the capitalised words that directly follow, scanning
        # forward from the match instead of re-tokenising the rest of the text.
        position = match.end()
        while True:
            next_word = following_word.match(text, position)
            if not next_word or not next_word.group(1)[0].isupper():
                break
            full_entity += ' ' + next_word.group(1)
            position = next_word.end()
        entities[full_entity.replace('. ', ' ').strip()] = None

    return list(entities)
509
+
510
def find_relevant_person(self, question, context, entities):
    """Find the most relevant person entity based on question context.

    For every case-insensitive occurrence of an entity in the context, the
    question key phrases found within 50 characters either side are
    counted. The entity with the highest total is returned; on a tie
    (including when nothing matches at all) the earliest entity wins.
    Returns None when no entities are given.
    """
    if not entities:
        return None

    keywords = self.extract_key_phrases(question)

    def proximity_score(candidate):
        # Sum keyword hits over the window around each occurrence.
        total = 0
        for hit in re.finditer(re.escape(candidate), context, re.IGNORECASE):
            left = max(0, hit.start() - 50)
            right = min(len(context), hit.start() + len(candidate) + 50)
            nearby = context[left:right].lower()
            total += sum(1 for kw in keywords if kw.lower() in nearby)
        return total

    # max() returns the first entity reaching the top score.
    return max(entities, key=proximity_score)
543
+
544
+ def find_answer_in_tables(self, question, tables):
545
+ """
546
+ Search through extracted tables to find an answer to the question.
547
+ """
548
+ if not tables:
549
+ return None
550
+
551
+ key_phrases = self.extract_key_phrases(question)
552
+ question_lower = question.lower()
553
+
554
+ for table in tables:
555
+ # Check if table is relevant to the question
556
+ table_is_relevant = False
557
+
558
+ # Check headers and body for keywords
559
+ all_text = []
560
+ if len(table) > 0:
561
+ headers = table[0]
562
+ all_text.extend(headers)
563
+ if len(table) > 1:
564
+ body_text = ' '.join([' '.join(row) for row in table[1:]])
565
+ all_text.extend(body_text.split())
566
+
567
+ all_text_lower = ' '.join(all_text).lower()
568
+ table_is_relevant = any(phrase.lower() in all_text_lower for phrase in key_phrases)
569
+
570
+ if not table_is_relevant:
571
+ continue
572
+
573
+ # Determine column types
574
+ column_types = self.detect_column_types(table)
575
+
576
+ # Handle different question types based on column types
577
+ if "how many" in question_lower or "what was the" in question_lower:
578
+ numeric_columns = [i for i, col_type in enumerate(column_types)
579
+ if col_type == 'number']
580
+
581
+ if numeric_columns and len(table) > 1:
582
+ # Find rows that match question keywords
583
+ relevant_rows = []
584
+ for row in table[1:]: # Skip header row
585
+ row_text = ' '.join(row).lower()
586
+ if any(phrase.lower() in row_text for phrase in key_phrases):
587
+ relevant_rows.append(row)
588
+
589
+ if relevant_rows:
590
+ # For each numeric column, collect the numbers from relevant rows
591
+ number_candidates = []
592
+ for row in relevant_rows:
593
+ for col_idx in numeric_columns:
594
+ if col_idx < len(row):
595
+ cell = row[col_idx]
596
+ numbers = re.findall(r'\d[\d,]*\d*', cell)
597
+ for num in numbers:
598
+ num_clean = num.replace(',', '')
599
+ if num_clean.isdigit():
600
+ number_candidates.append((int(num_clean), row))
601
+
602
+ if number_candidates:
603
+ # Return the first number found in relevant rows
604
+ first_num = number_candidates[0][0]
605
+ return f"The answer is {first_num}."
606
+
607
+ elif "who" in question_lower or "which person" in question_lower:
608
+ # Try to identify name columns
609
+ name_columns = []
610
+ for i, col_type in enumerate(column_types):
611
+ if col_type == 'name' and len(table) > 1:
612
+ # Check if this column looks like names
613
+ sample_values = [row[i] for row in table[1:min(5, len(table))]]
614
+ if self.column_looks_like_names(sample_values):
615
+ name_columns.append(i)
616
+
617
+ if name_columns:
618
+ relevant_rows = []
619
+ for row in table[1:]:
620
+ row_text = ' '.join(row).lower()
621
+ if any(phrase.lower() in row_text for phrase in key_phrases):
622
+ relevant_rows.append(row
623
+
624
+
625
+
626
+
627
+
628
  class AgentState(TypedDict, total=False):
      """State dictionary handed to StateGraph; total=False makes every key optional."""
      question: str
      is_reversed: bool
      is_python: bool
      is_riddle: bool
      is_wiki: bool  # read by the routing edge after check_wikipedia_suitability
      needs_reasoning: bool  # read by the routing edge after check_reasoning_needed
      response: str
      use_tool: str  # NOTE(review): not referenced in the visible code — confirm it is still used
637
+
638
  builder = StateGraph(AgentState)
639
  # --- Nodes ---
640
  builder.add_node("check_reversed", check_reversed)
 
648
  builder.add_node("check_python_suitability", check_python_suitability)
649
  builder.add_node("generate_code", generate_code)
650
  builder.add_node("fallback", fallback)
651
+
652
+ # Entry point remains the same
653
  builder.set_entry_point("check_reversed")
654
+
655
+ # Edges - updated to match your current workflow
656
  builder.add_edge("check_reversed", "fix_question")
657
  builder.add_edge("fix_question", "check_riddle_or_trick")
 
658
  builder.add_conditional_edges(
659
  "check_riddle_or_trick",
660
  lambda s: "solve_riddle" if s.get("is_riddle") else "check_wikipedia_suitability"
661
  )
 
662
  builder.add_conditional_edges(
663
  "check_wikipedia_suitability",
664
  lambda s: "search_wikipedia" if s.get("is_wiki") else "check_reasoning_needed"
665
  )
 
666
  builder.add_conditional_edges(
667
  "check_reasoning_needed",
668
  lambda s: "general_reasoning_qa" if s.get("needs_reasoning") else "check_python_suitability"
669
  )
 
670
  builder.add_conditional_edges(
671
  "check_python_suitability",
672
  lambda s: "generate_code" if s.get("is_python") else "fallback"
673
  )
674
+
675
+ # Ending edges
676
  builder.add_edge("solve_riddle", END)
677
  builder.add_edge("search_wikipedia", END)
678
  builder.add_edge("general_reasoning_qa", END)
679
  builder.add_edge("generate_code", END)
680
  builder.add_edge("fallback", END)
681
+
682
  graph = builder.compile()
683
  return graph
684
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
685
  def __call__(self, question: str) -> str:
686
  state = {"question": question}
687
  result = self.graph.invoke(state)
688
  return result.get("response", "No answer generated.")
689
 
690
 
691
+
692
+
693
+
694
 
695
  ########################################
696
  def run_and_submit_all( profile: gr.OAuthProfile | None):