thinhbtt commited on
Commit
0fe4160
·
verified ·
1 Parent(s): a39dddb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +223 -87
app.py CHANGED
@@ -222,125 +222,261 @@ def youtube_oembed_title_desc(url):
222
  # ---
223
  # Agent
224
  # ---
 
225
class BasicAgent:
    def __init__(self):
        """Set up the rule-based agent and record the scoring API base URL."""
        # Announce which agent variant is active in the logs.
        print("Level-2 Rule Agent initialized (wiki + file tools).")
        self.api_url = DEFAULT_API_URL
230
def solve_math(self, text):
    """Evaluate a simple arithmetic expression embedded in *text*.

    Extracts numeric tokens and the four basic operators, then evaluates
    the joined expression with a restricted AST walker instead of `eval`
    (the question text is untrusted input, so `eval` would allow
    arbitrary code execution).

    Returns the result as a string (integral floats collapsed to ints),
    or None when no evaluable expression is found.
    """
    tokens = re.findall(r"[-+]?\d+\.?\d*|[\+\-\*\/]", text)
    # Need at least "number op number" to bother evaluating.
    if len(tokens) < 3:
        return None
    expr = "".join(tokens)
    if len(expr) >= 100:  # guard against pathological inputs
        return None
    try:
        result = _safe_eval_arith(expr)
        if isinstance(result, float) and result.is_integer():
            result = int(result)
        return str(result)
    except Exception:
        # Mirrors the original best-effort contract: any parse/eval
        # failure (including division by zero) means "no math answer".
        return None

def _safe_eval_arith(expr):
    """Evaluate +, -, *, / over numeric literals via `ast`; raise on anything else."""
    import ast

    def walk(node):
        if isinstance(node, ast.Expression):
            return walk(node.body)
        if isinstance(node, ast.BinOp) and isinstance(
                node.op, (ast.Add, ast.Sub, ast.Mult, ast.Div)):
            left, right = walk(node.left), walk(node.right)
            if isinstance(node.op, ast.Add):
                return left + right
            if isinstance(node.op, ast.Sub):
                return left - right
            if isinstance(node.op, ast.Mult):
                return left * right
            return left / right
        if isinstance(node, ast.UnaryOp) and isinstance(node.op, (ast.UAdd, ast.USub)):
            value = walk(node.operand)
            return -value if isinstance(node.op, ast.USub) else value
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        raise ValueError("disallowed expression node")

    return walk(ast.parse(expr, mode="eval"))
247
 
248
def solve_counting(self, text):
    """Answer character/word counting questions about a quoted string.

    Handles 'how many characters in "..."', 'how many words in "..."',
    plus a generic quoted-string fallback when the question contains
    counting language.  Returns the count as a string, or None.
    """
    char_q = re.search(r'how many characters in\s*"(.*?)"', text, re.I)
    if char_q is not None:
        return str(len(char_q.group(1)))

    word_q = re.search(r'how many words in\s*"(.*?)"', text, re.I)
    if word_q is not None:
        return str(len(word_q.group(1).split()))

    # Generic fallback: any quoted string plus counting phrasing.
    lowered = text.lower()
    quoted = re.search(r'"(.*?)"', text)
    if quoted is not None and ("characters" in lowered or "how many" in lowered):
        return str(len(quoted.group(1)))

    return None
261
 
262
def solve_simple_facts(self, text):
    """Answer a handful of hard-coded trivia patterns.

    Covers a few fixed facts and a crude reversed-text heuristic
    (questions ending in 'fi', i.e. the word 'if' backwards).
    Returns the answer string, or None when nothing matches.
    """
    lowered = text.lower()

    fixed_answers = (
        ("capital of france", "Paris"),
        ("capital of japan", "Tokyo"),
        # 'pi to 2 decimal' is a substring of the plural phrasing too.
        ("pi to 2 decimal", "3.14"),
    )
    for needle, answer in fixed_answers:
        if needle in lowered:
            return answer

    # Reversed-sentence heuristic used by some GAIA items.
    looks_reversed = re.search(r'\bfi\b$', text.strip(), re.I) is not None
    asks_reverse = "reverse" in lowered and "text" in lowered
    if looks_reversed or asks_reverse:
        # Reverse each word in place, preserving word order.
        return " ".join(word[::-1] for word in text.split())

    return None
277
 
278
def solve_with_wikipedia(self, question, task_id=None):
    """Heuristic numeric lookup: URLs in the question, task attachments,
    then a Wikipedia search.

    Returns the first extracted number (whatever type extract_numbers
    yields) or None.  Every failure is swallowed deliberately - this is
    a best-effort fallback solver.
    """
    try:
        url_match = re.search(r'https?://[^\s]+', question)
        if url_match:
            url = url_match.group(0)
            # YouTube links: mine the oEmbed title/description for numbers.
            if "youtube.com" in url or "youtu.be" in url:
                meta = youtube_oembed_title_desc(url)
                if meta:
                    numbers = extract_numbers(meta)
                    if numbers:
                        return numbers[0]
            # Any other URL: fetch the raw page and mine it for numbers.
            try:
                resp = requests.get(url, headers=USER_AGENT, timeout=10)
                resp.raise_for_status()
                numbers = extract_numbers(resp.text)
                if numbers:
                    return numbers[0]
            except Exception:
                pass  # network errors are non-fatal here

        # Some tasks ship an attachment; mine it for numbers next.
        # NOTE(review): assumes this branch sits at try-level, not nested
        # inside the URL branch - confirm against the original file.
        if task_id:
            attachment_text = fetch_file_text(self.api_url, task_id)
            if attachment_text:
                numbers = extract_numbers(attachment_text)
                if numbers:
                    return numbers[0]
            # Attachment path exhausted: give up rather than fall through.
            return None

        # Finally, fall back to a Wikipedia search heuristic.
        candidate = wiki_try_find_number(question)
        if candidate:
            return candidate
    except Exception:
        return None
    return None
320
 
 
321
def __call__(self, question: str, task_id: str = None) -> str:
    """Route the question through each rule-based solver in priority
    order and return the first non-empty answer, else 'unknown'."""
    q = question or ""
    print("Solving question:", q[:80].replace("\n", " ") + "...")

    # Ordered pipeline: math, counting, trivial facts.
    for solver in (self.solve_math, self.solve_counting, self.solve_simple_facts):
        answer = solver(q)
        if answer:
            return answer

    # Wikipedia / files / URL heuristics need the task id as well.
    answer = self.solve_with_wikipedia(q, task_id=task_id)
    if answer:
        return answer

    # Nothing matched.
    return "unknown"
 
 
 
 
 
 
 
 
 
 
342
 
343
- # ---
344
  # Submission runner
345
  # ---
346
  def run_and_submit_all(profile: gr.OAuthProfile | None):
 
222
  # ---
223
  # Agent
224
  # ---
225
# Replace the existing BasicAgent with this improved version
class BasicAgent:
    def __init__(self):
        """Set up the v2 rule agent and record the scoring API base URL."""
        # Announce which agent variant is active in the logs.
        print("Level-2 Rule Agent v2 initialized (wiki + file tools + album parser).")
        self.api_url = DEFAULT_API_URL
231
# helper: get two years from text if present
def parse_year_range(self, text):
    """Extract the first two 4-digit years (19xx/20xx) from *text*.

    Returns (min_year, max_year) as ints, or None when fewer than two
    years are present.

    NOTE: the (?:...) group is essential - with a capturing group,
    re.findall returns only the captured '19'/'20' prefixes instead of
    the full year strings (the bug in the earlier draft of this helper).
    """
    years = re.findall(r"\b(?:19|20)\d{2}\b", text)
    if len(years) < 2:
        return None
    first, second = int(years[0]), int(years[1])
    return min(first, second), max(first, second)
248
 
249
# helper: find artist/name between "by <name> between"
def parse_artist_from_question(self, q):
    """Pull the artist/name that follows 'by' out of the question text.

    Tries progressively looser patterns: 'by X between', 'by X from',
    'by X?' / 'by X.' at sentence end, then a bare 'by X' with trailing
    qualifiers ('between ...', 'in ...') trimmed off.  Returns the name
    or None.
    """
    # Anchored patterns first: text after 'by' up to a known stop token.
    anchored = (
        r"by\s+(.+?)\s+between",
        r"by\s+(.+?)\s+from",
        r"by\s+(.+?)(?:\?|\.$)",
    )
    for pattern in anchored:
        hit = re.search(pattern, q, re.I)
        if hit:
            return hit.group(1).strip()

    # Loosest fallback: everything after 'by', minus trailing qualifiers.
    hit = re.search(r"by\s+(.+)", q, re.I)
    if hit:
        name = hit.group(1)
        name = re.sub(r"\s+between.*", "", name, flags=re.I).strip()
        name = re.sub(r"\s+in.*", "", name, flags=re.I).strip()
        return name
    return None
272
 
273
def wiki_get_page_html(self, title):
    """Fetch the desktop HTML of an English Wikipedia article.

    Requests https://en.wikipedia.org/wiki/<Title> with spaces replaced
    by underscores.  (The earlier docstring claimed an index.php
    printable=yes fetch, which this code never did.)

    Returns the HTML text on HTTP 200, else None; network errors are
    swallowed because callers have text-extract fallbacks.
    """
    try:
        url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
        r = requests.get(url, headers=USER_AGENT, timeout=10)
        if r.status_code == 200:
            return r.text
    except Exception:
        pass
    return None
283
 
284
def extract_studio_section_text(self, page_html):
    """Isolate the 'Studio albums' (or 'Discography') section of a
    Wikipedia page.

    Prefers BeautifulSoup when available: locates a matching h2/h3/h4
    header and gathers sibling content up to the next header.  Falls
    back to a crude substring slice, and to the whole page when no
    marker is found at all.  Returns a text chunk ('' for empty input).
    """
    if not page_html:
        return ""

    if BeautifulSoup is not None:
        try:
            soup = BeautifulSoup(page_html, "html.parser")
            for header in soup.find_all(['h2', 'h3', 'h4']):
                title = header.get_text(" ").strip().lower()
                if "studio album" not in title and "discography" not in title:
                    continue
                # Collect sibling nodes until the next section header.
                collected = []
                node = header.next_sibling
                while node:
                    if getattr(node, 'name', None) in ['h2', 'h3', 'h4']:
                        break
                    collected.append(getattr(node, 'get_text', lambda: str(node))())
                    node = node.next_sibling
                section = "\n".join(collected)
                if section:
                    return section
        except Exception:
            pass

    # Crude fallback: slice the raw HTML after the first marker found.
    lowered = page_html.lower()
    marker = lowered.find("studio albums")
    if marker == -1:
        marker = lowered.find("discography")
    if marker == -1:
        # No marker anywhere - hand back the whole page.
        return page_html
    return page_html[marker: marker + 12000]  # large slice
324
+
325
def count_albums_between(self, page_html, y_min, y_max):
    """Count year mentions within [y_min, y_max] in a page chunk.

    Strips HTML tags crudely, finds 4-digit years - including ranges
    such as '2001–2003', '2001-2003', '2001 to 2003', which count once
    via their start year (the previous version appended every year of a
    range, double-counting such entries against its own comment) - and
    returns how many fall inside the window.

    Returns None for empty input or when no year lands in range;
    callers treat None as 'no answer'.
    """
    if not page_html:
        return None
    # Drop tags crudely; good enough for counting year tokens.
    text = re.sub(r"<[^>]+>", " ", page_html)
    # Single years or ranges like 2001–2003 / 2001-2003 / 2001 to 2003.
    matches = re.findall(
        r"(?:\b(?:19|20)\d{2}\b(?:\s*(?:–|-|to)\s*\b(?:19|20)\d{2}\b)?)", text)
    years = []
    for token in matches:
        token_years = re.findall(r"\b(?:19|20)\d{2}\b", token)
        if token_years:
            # A range describes one entry: count only its start year.
            years.append(int(token_years[0]))
    if not years:
        return None
    count = sum(1 for y in years if y_min <= y <= y_max)
    return count if count else None
350
 
351
def solve_studio_albums_between(self, question):
    """Answer 'how many studio albums ... by X between Y1 and Y2'.

    Pipeline: detect the question shape, parse the year window and the
    artist, resolve the artist's Wikipedia page, then count album years
    in the discography section.  Falls back to the plain-text API
    extract when HTML is unavailable, and to sentence scanning as a
    last resort.  Returns the count as a string, or None on any miss.
    """
    if "studio album" not in question.lower():
        return None

    # Year window is mandatory for this question type.
    year_window = self.parse_year_range(question)
    if not year_window:
        return None
    y_min, y_max = year_window

    # Artist name, with a targeted retry for the canonical phrasing:
    # "How many studio albums were published by X between ... ?"
    artist = self.parse_artist_from_question(question)
    if not artist:
        hit = re.search(r"how many studio albums .* by (.*?) between", question, re.I)
        if hit:
            artist = hit.group(1).strip()
    if not artist:
        return None

    title = wikipedia_search_first_page(artist)
    if not title:
        return None

    page_html = self.wiki_get_page_html(title)
    if not page_html:
        # No HTML: fall back to counting years in the API text extract.
        extract = wikipedia_get_extract(title)
        if extract:
            found = [int(y) for y in re.findall(r"\b(?:19|20)\d{2}\b", extract)]
            in_range = sum(1 for y in found if y_min <= y <= y_max)
            if in_range:
                return str(in_range)
        return None

    section = self.extract_studio_section_text(page_html)
    if not section:
        return None
    counted = self.count_albums_between(section, y_min, y_max)
    if counted is not None:
        return str(counted)

    # Last resort: scan extract sentences that mention 'studio album'.
    extract = wikipedia_get_extract(title)
    if extract:
        for sentence in re.split(r'(?<=[\.\?\!])\s+', extract):
            if "studio album" not in sentence.lower():
                continue
            found = [int(y) for y in re.findall(r"\b(?:19|20)\d{2}\b", sentence)]
            in_range = sum(1 for y in found if y_min <= y <= y_max)
            if in_range:
                return str(in_range)
    return None
406
+
407
# reverse detection: explicit 'reverse' instructions or fully reversed questions
def detect_and_reverse_text(self, question):
    """Handle 'reverse ...' instructions and fully reversed questions.

    When the question explicitly asks to reverse something, the quoted
    (or post-colon) payload is returned character-reversed.  Otherwise
    the whole question is reversed only when it reads like reversed
    English: reversing it yields strictly more common English words
    than the forward reading.  Returns the reversed string or None.

    BUG FIX: the previous heuristic counted words whose *reversal*
    contained two lowercase letters - true of nearly every English
    word - so every question longer than four words was reversed.
    """
    # Small vocabulary used to tell forward English from reversed English.
    common_words = {
        "the", "a", "an", "of", "is", "are", "to", "and", "you", "if",
        "this", "that", "in", "it", "as", "what", "how", "many", "was",
        "were", "by", "with", "for", "on", "at", "write", "word",
        "answer", "sentence", "understand", "left", "right", "opposite",
    }

    def english_hits(text):
        # Count whitespace tokens that are common words (punctuation stripped).
        return sum(
            1 for token in text.lower().split()
            if token.strip('.,!?;:"\'()') in common_words
        )

    t = question.strip()
    lowered = t.lower()

    if "reverse" in lowered or "reversed" in lowered:
        # Explicit instruction: reverse the quoted payload if present...
        quoted = re.search(r'"(.*?)"', t)
        if quoted:
            return quoted.group(1)[::-1]  # character reversal for exact match
        # ...otherwise reverse whatever follows a colon.
        after_colon = re.search(r':\s*(.+)', t)
        if after_colon:
            return after_colon.group(1).strip()[::-1]

    # Implicit case: the whole question may be written backwards.
    words = t.split()
    if len(words) > 4:
        forward = english_hits(t)
        backward = english_hits(t[::-1])
        # Reverse only when the reversed reading is clearly more English.
        if backward > forward and backward >= len(words) // 2:
            return t[::-1]
    return None
427
 
428
+ # fallback: reuse prior solve_* and wiki heuristics
429
def __call__(self, question: str, task_id: str = None) -> str:
    """Dispatch the question through the solvers in priority order.

    Order: reverse-text handling, studio-album counting, math,
    counting, fixed facts, then wikipedia/URL/file heuristics.
    Numeric answers are normalized ('15.0' -> '15').  Returns
    'unknown' when every solver declines.
    """
    q = (question or "").strip()
    print("Solving (v2):", q[:120].replace("\n", " ") + "...")

    # 0. explicit/implicit reversed-text questions
    rev = self.detect_and_reverse_text(q)
    if rev:
        return rev

    # 1. 'how many studio albums ... between Y1 and Y2'
    album_ans = self.solve_studio_albums_between(q)
    if album_ans:
        return album_ans

    # 2. arithmetic
    ans = self.solve_math(q)
    if ans:
        # normalize numeric strings: '12.0' -> '12'
        if re.match(r"^\d+\.0+$", ans):
            ans = str(int(float(ans)))
        return ans

    # 3. character/word counting
    ans = self.solve_counting(q)
    if ans:
        return ans

    # 4. hard-coded facts
    ans = self.solve_simple_facts(q)
    if ans:
        return ans

    # 5. wikipedia / URL / attachment heuristics
    ans = self.solve_with_wikipedia(q, task_id=task_id)
    if ans:
        if isinstance(ans, (int, float)):
            return str(ans)
        s = str(ans)
        # strip trailing decimals: '15.0' -> '15'
        if re.match(r"^\d+\.\d+$", s):
            # The regex guarantees digits-dot-digits, so float() cannot
            # realistically fail; a narrow except replaces the previous
            # bare `except:` which masked real errors (and SystemExit).
            try:
                f = float(s)
                if f.is_integer():
                    return str(int(f))
            except ValueError:
                pass
        return s

    return "unknown"
480
  # Submission runner
481
  # ---
482
  def run_and_submit_all(profile: gr.OAuthProfile | None):