Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -222,125 +222,261 @@ def youtube_oembed_title_desc(url):
|
|
| 222 |
# ---
|
| 223 |
# Agent
|
| 224 |
# ---
|
|
|
|
| 225 |
class BasicAgent:
|
| 226 |
def __init__(self):
|
| 227 |
-
print("Level-2 Rule Agent initialized (wiki + file tools).")
|
| 228 |
self.api_url = DEFAULT_API_URL
|
| 229 |
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
#
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
return None
|
| 247 |
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
|
|
|
| 251 |
if m:
|
| 252 |
-
return
|
| 253 |
-
|
|
|
|
| 254 |
if m2:
|
| 255 |
-
return
|
| 256 |
-
#
|
| 257 |
-
m3 = re.search(r
|
| 258 |
-
if m3
|
| 259 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
return None
|
| 261 |
|
| 262 |
-
def
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
# reversed sentence detection (some GAIA items)
|
| 272 |
-
if re.search(r'\bfi\b$', text.strip(), re.I) or ("reverse" in t and "text" in t):
|
| 273 |
-
# try a simple reverse of words if that seems to be asked
|
| 274 |
-
words = text.split()
|
| 275 |
-
return " ".join(w[::-1] for w in words)
|
| 276 |
return None
|
| 277 |
|
| 278 |
-
def
|
| 279 |
-
"""Try to
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
try:
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
page_text = r.text
|
| 297 |
-
nums = extract_numbers(page_text)
|
| 298 |
-
if nums:
|
| 299 |
-
return nums[0]
|
| 300 |
-
except Exception:
|
| 301 |
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
return None
|
| 320 |
|
|
|
|
| 321 |
def __call__(self, question: str, task_id: str = None) -> str:
|
| 322 |
q = question or ""
|
| 323 |
-
|
| 324 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
ans = self.solve_math(q)
|
| 326 |
if ans:
|
|
|
|
|
|
|
|
|
|
| 327 |
return ans
|
| 328 |
-
|
|
|
|
| 329 |
ans = self.solve_counting(q)
|
| 330 |
if ans:
|
| 331 |
return ans
|
| 332 |
-
|
|
|
|
| 333 |
ans = self.solve_simple_facts(q)
|
| 334 |
if ans:
|
| 335 |
return ans
|
| 336 |
-
|
|
|
|
| 337 |
ans = self.solve_with_wikipedia(q, task_id=task_id)
|
| 338 |
if ans:
|
| 339 |
-
return
|
| 340 |
-
|
| 341 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
|
| 343 |
-
|
| 344 |
# Submission runner
|
| 345 |
# ---
|
| 346 |
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
|
|
| 222 |
# ---
|
| 223 |
# Agent
|
| 224 |
# ---
|
| 225 |
+
# Replace the existing BasicAgent with this improved version
|
| 226 |
class BasicAgent:
    """Rule-based question answerer (v2).

    Solvers are tried in a fixed order by ``__call__``:

    0. reversed-text questions,
    1. "how many studio albums by <artist> between <year> and <year>",
    2. math, 3. counting, 4. simple facts, 5. Wikipedia numeric heuristic.

    Steps 2-5 are delegated to ``solve_math``, ``solve_counting``,
    ``solve_simple_facts`` and ``solve_with_wikipedia``.  Those methods are
    not defined in this class body; ``__call__`` skips any that are missing.
    """

    # Four-digit years in the 1900s/2000s.
    _YEAR_RE = re.compile(r"\b(?:19|20)\d{2}\b")
    # A year optionally followed by "–", "-" or "to" and a second year.
    # Group 1 is always the first (start) year.
    _YEAR_OR_RANGE_RE = re.compile(
        r"\b((?:19|20)\d{2})\b(?:\s*(?:–|-|to)\s*(?:19|20)\d{2}\b)?"
    )
    # Heading tags that delimit sections on an article page.
    _HEADING_TAGS = ("h2", "h3", "h4")
    # Size of the raw-HTML slice returned by the string-search fallback.
    _FALLBACK_CHUNK_CHARS = 12000
    # Frequent English words whose mirror image is not itself in this set;
    # used to decide whether a sentence was written backwards.
    _COMMON_WORDS = frozenset({
        "the", "and", "of", "to", "is", "in", "you", "this", "that", "what",
        "how", "for", "with", "as", "are", "if", "write", "answer", "word",
    })

    def __init__(self):
        print("Level-2 Rule Agent v2 initialized (wiki + file tools + album parser).")
        self.api_url = DEFAULT_API_URL

    # ------------------------------------------------------------------
    # Question parsing helpers
    # ------------------------------------------------------------------
    def parse_year_range(self, text):
        """Return ``(low, high)`` from the first two years found in *text*.

        Years are four-digit numbers starting with 19 or 20.  Returns
        ``None`` when *text* is empty or mentions fewer than two years.
        """
        if not text:
            return None
        found = self._YEAR_RE.findall(text)
        if len(found) < 2:
            return None
        first, second = int(found[0]), int(found[1])
        return min(first, second), max(first, second)

    def parse_artist_from_question(self, q):
        """Extract the name that follows the word "by" in *q*.

        Patterns are tried from most to least specific: "by X between",
        "by X from", "by X?" / "by X." at end of text, and finally
        everything after "by" with a trailing "between ..." / "in ..."
        clause removed.  Returns ``None`` when no non-empty name is found.
        """
        if not q:
            return None
        # ``\b`` keeps "by" from matching the tail of words such as "baby".
        specific_patterns = (
            r"\bby\s+(.+?)\s+between",
            r"\bby\s+(.+?)\s+from",
            r"\bby\s+(.+?)(?:\?|\.$)",
        )
        for pattern in specific_patterns:
            hit = re.search(pattern, q, re.I)
            if hit and hit.group(1).strip():
                return hit.group(1).strip()

        tail = re.search(r"\bby\s+(.+)", q, re.I)
        if not tail:
            return None
        name = re.sub(r"\s+between\b.*", "", tail.group(1), flags=re.I).strip()
        # Whole word "in" only, so "Nine Inch Nails" is not cut to "Nine".
        name = re.sub(r"\s+in\b.*", "", name, flags=re.I).strip()
        return name or None

    # ------------------------------------------------------------------
    # Wikipedia access and scraping
    # ------------------------------------------------------------------
    def wiki_get_page_html(self, title):
        """Fetch ``https://en.wikipedia.org/wiki/<title>`` and return its HTML.

        Spaces in *title* become underscores and the result is
        percent-encoded.  Returns ``None`` for an empty title, a non-200
        response, or a request failure (best effort: errors are not raised).
        """
        from urllib.parse import quote

        if not title:
            return None
        url = f"https://en.wikipedia.org/wiki/{quote(title.replace(' ', '_'))}"
        try:
            r = requests.get(url, headers=USER_AGENT, timeout=10)
        except requests.RequestException:
            return None
        return r.text if r.status_code == 200 else None

    @classmethod
    def _heading_level(cls, node):
        """Return 2-4 if *node* is, or directly wraps, an h2-h4 heading; else None."""
        tag = getattr(node, "name", None)
        if tag in cls._HEADING_TAGS:
            return int(tag[1])
        # NOTE(review): current MediaWiki output appears to wrap headings in
        # <div class="mw-heading">; confirm against a live page.
        if tag == "div" and "mw-heading" in (node.get("class") or []):
            inner = node.find(list(cls._HEADING_TAGS))
            if inner is not None:
                return int(inner.name[1])
        return None

    def _collect_section_text(self, heading):
        """Join the text of the nodes after *heading* up to the next
        heading of the same or a higher level."""
        level = int(heading.name[1])
        wrapper = heading.parent
        # When the heading sits inside a wrapper div, the section content
        # is a sibling of the wrapper, not of the heading itself.
        start = wrapper if self._heading_level(wrapper) is not None else heading
        pieces = []
        node = start.next_sibling
        # ``is not None``: an empty text node is falsy and must not end the walk.
        while node is not None:
            node_level = self._heading_level(node)
            if node_level is not None and node_level <= level:
                break
            if hasattr(node, "get_text"):
                # A separator keeps adjacent cells apart ("2005 Title"
                # rather than "2005Title") so year regexes still match.
                pieces.append(node.get_text(" "))
            else:
                pieces.append(str(node))
            node = node.next_sibling
        return "\n".join(pieces)

    def extract_studio_section_text(self, page_html):
        """Return the text of the "Studio albums" (or "Discography") section.

        With BeautifulSoup available, headings containing "studio album"
        are preferred over ones containing "discography", and the first
        section with non-blank text wins.  Otherwise, or when nothing is
        found, a raw-HTML slice starting at the first "studio albums" /
        "discography" occurrence is returned; if neither phrase occurs the
        whole page is returned.  Returns ``""`` for empty input.
        """
        if not page_html:
            return ""

        if BeautifulSoup is not None:
            try:
                soup = BeautifulSoup(page_html, "html.parser")
                headings = soup.find_all(list(self._HEADING_TAGS))
                for keyword in ("studio album", "discography"):
                    for heading in headings:
                        if keyword not in heading.get_text(" ").strip().lower():
                            continue
                        section = self._collect_section_text(heading)
                        if section.strip():
                            return section
            except Exception:
                # Deliberate best effort: any parsing problem falls through
                # to the plain string search below.
                pass

        # Search the original string directly so the index is valid for it.
        marker = (re.search(r"studio albums", page_html, re.I)
                  or re.search(r"discography", page_html, re.I))
        if marker is None:
            return page_html
        begin = marker.start()
        return page_html[begin: begin + self._FALLBACK_CHUNK_CHARS]

    def count_albums_between(self, page_html, y_min, y_max):
        """Count year mentions in *page_html* that fall in ``[y_min, y_max]``.

        Tags are stripped crudely and HTML entities decoded.  A range such
        as "2001–2003" contributes only its start year.  This counts year
        occurrences, not verified album entries.  Returns ``None`` for
        empty input or when nothing falls inside the interval.
        """
        import html

        if not page_html:
            return None
        if y_min > y_max:
            y_min, y_max = y_max, y_min
        text = html.unescape(re.sub(r"<[^>]+>", " ", page_html))
        start_years = [int(m.group(1)) for m in self._YEAR_OR_RANGE_RE.finditer(text)]
        in_range = sum(1 for year in start_years if y_min <= year <= y_max)
        return in_range or None

    def _count_years_in_range(self, text, y_min, y_max):
        """Count every year occurrence in plain *text* within ``[y_min, y_max]``."""
        return sum(1 for y in self._YEAR_RE.findall(text) if y_min <= int(y) <= y_max)

    def solve_studio_albums_between(self, question):
        """Answer "how many studio albums ... by <artist> between <y1> and <y2>".

        Returns the count as a string, or ``None`` when the question does
        not mention "studio album", lacks two years or an artist, or no
        count can be derived.  Uses the module's
        ``wikipedia_search_first_page`` and ``wikipedia_get_extract``.
        """
        if not question or "studio album" not in question.lower():
            return None
        year_span = self.parse_year_range(question)
        if not year_span:
            return None
        y_min, y_max = year_span
        artist = self.parse_artist_from_question(question)
        if not artist:
            return None

        title = wikipedia_search_first_page(artist)
        if not title:
            return None

        page_html = self.wiki_get_page_html(title)
        if page_html:
            section = self.extract_studio_section_text(page_html)
            if not section:
                return None
            count = self.count_albums_between(section, y_min, y_max)
            if count is not None:
                return str(count)

        extract = wikipedia_get_extract(title)
        if not extract:
            return None
        if not page_html:
            # No HTML at all: fall back to every year in the plain extract.
            count = self._count_years_in_range(extract, y_min, y_max)
            return str(count) if count else None
        # HTML gave no count: only trust sentences that mention studio albums.
        for sentence in re.split(r'(?<=[\.\?\!])\s+', extract):
            if "studio album" in sentence.lower():
                count = self._count_years_in_range(sentence, y_min, y_max)
                if count:
                    return str(count)
        return None

    # ------------------------------------------------------------------
    # Reversed-text handling
    # ------------------------------------------------------------------
    @classmethod
    def _looks_reversed(cls, text):
        """Return True when *text* reads as English only after reversing it.

        Compares how many common English words appear in the text as given
        and in its character-wise reversal; at least two hits are needed
        backwards, and more than forwards.
        """
        lowered = text.lower()
        forward = sum(tok in cls._COMMON_WORDS for tok in re.findall(r"[a-z]+", lowered))
        backward = sum(tok in cls._COMMON_WORDS for tok in re.findall(r"[a-z]+", lowered[::-1]))
        return backward >= 2 and backward > forward

    def detect_and_reverse_text(self, question):
        """Return reversed text when the question calls for it, else ``None``.

        If the question contains "reverse", the first double-quoted span
        (or, failing that, the text after the first colon) is returned
        character-reversed.  Otherwise, a question of more than four words
        that appears to be written backwards is returned reversed.
        """
        t = (question or "").strip()
        if not t:
            return None
        if "reverse" in t.lower():
            quoted = re.search(r'"(.*?)"', t)
            if quoted:
                return quoted.group(1)[::-1]
            after_colon = re.search(r':\s*(.+)', t)
            if after_colon:
                return after_colon.group(1).strip()[::-1]
        if len(t.split()) > 4 and self._looks_reversed(t):
            return t[::-1]
        return None

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------
    @staticmethod
    def _normalize_answer(ans):
        """Render *ans* as a string, dropping an all-zero fraction ("15.0" -> "15")."""
        if isinstance(ans, float) and ans.is_integer():
            return str(int(ans))
        text = str(ans)
        if re.fullmatch(r"-?\d+\.0+", text):
            return text.split(".", 1)[0]
        return text

    def _run_solver(self, name, *args, **kwargs):
        """Call ``self.<name>(...)`` if it exists; return ``None`` otherwise.

        Keeps one missing solver from aborting the whole pipeline with
        ``AttributeError``.
        """
        solver = getattr(self, name, None)
        if not callable(solver):
            return None
        return solver(*args, **kwargs)

    def __call__(self, question: str, task_id: str = None) -> str:
        """Answer *question*; return ``"unknown"`` when no solver succeeds.

        *task_id* is forwarded only to ``solve_with_wikipedia``.
        """
        q = (question or "").strip()
        print("Solving (v2):", q[:120].replace("\n", " ") + "...")

        # 0. reverse instructions
        rev = self.detect_and_reverse_text(q)
        if rev:
            return rev

        # 1. album-specific
        album_ans = self.solve_studio_albums_between(q)
        if album_ans:
            return album_ans

        # 2. math (numeric strings are normalized: "15.0" -> "15")
        ans = self._run_solver("solve_math", q)
        if ans:
            return self._normalize_answer(ans)

        # 3. counting
        ans = self._run_solver("solve_counting", q)
        if ans:
            return ans

        # 4. simple facts
        ans = self._run_solver("solve_simple_facts", q)
        if ans:
            return ans

        # 5. wikipedia numeric heuristic
        ans = self._run_solver("solve_with_wikipedia", q, task_id=task_id)
        if ans:
            return self._normalize_answer(ans)

        return "unknown"
|
| 480 |
# Submission runner
|
| 481 |
# ---
|
| 482 |
def run_and_submit_all(profile: gr.OAuthProfile | None):
|