Upload 12 files
Browse files- app/__init__.py +3 -0
- app/__pycache__/__init__.cpython-311.pyc +0 -0
- app/__pycache__/browser.cpython-311.pyc +0 -0
- app/__pycache__/llm.cpython-311.pyc +0 -0
- app/__pycache__/main.cpython-311.pyc +0 -0
- app/__pycache__/solver.cpython-311.pyc +0 -0
- app/__pycache__/utils.cpython-311.pyc +0 -0
- app/browser.py +247 -0
- app/llm.py +251 -0
- app/main.py +250 -0
- app/solver.py +593 -0
- app/utils.py +180 -0
app/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# IITM LLM Quiz Solver
|
| 2 |
+
__version__ = "1.0.0"
|
| 3 |
+
|
app/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (171 Bytes). View file
|
|
|
app/__pycache__/browser.cpython-311.pyc
ADDED
|
Binary file (12.5 kB). View file
|
|
|
app/__pycache__/llm.cpython-311.pyc
ADDED
|
Binary file (10.2 kB). View file
|
|
|
app/__pycache__/main.cpython-311.pyc
ADDED
|
Binary file (11.3 kB). View file
|
|
|
app/__pycache__/solver.cpython-311.pyc
ADDED
|
Binary file (27.3 kB). View file
|
|
|
app/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (6.5 kB). View file
|
|
|
app/browser.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Playwright browser helper for loading and interacting with quiz pages.
|
| 3 |
+
"""
|
| 4 |
+
import asyncio
|
| 5 |
+
import logging
|
| 6 |
+
from typing import Optional, Dict, Any, List
|
| 7 |
+
from playwright.async_api import async_playwright, Browser, Page, BrowserContext
|
| 8 |
+
import time
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class BrowserHelper:
    """Helper class for managing Playwright browser sessions.

    Resources (playwright driver, browser, context, page) are created by
    start() and released by close(); close() also resets the attributes
    to None so the same instance can be restarted safely.
    """

    def __init__(self):
        # All resources are lazily initialised in start().
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.page: Optional[Page] = None
        self.playwright = None

    async def start(self, headless: bool = True) -> None:
        """
        Start Playwright browser.

        Args:
            headless: Run in headless mode

        Raises:
            Exception: Propagated when Chromium cannot be launched.
        """
        try:
            self.playwright = await async_playwright().start()
            # Sandbox/GPU flags make Chromium usable inside containers.
            self.browser = await self.playwright.chromium.launch(
                headless=headless,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-accelerated-2d-canvas',
                    '--disable-gpu'
                ]
            )
            # Desktop-sized viewport plus a realistic UA string reduces the
            # chance of being served mobile or bot-blocked pages.
            self.context = await self.browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
            )
            self.page = await self.context.new_page()
            logger.info("Browser started successfully")
        except Exception as e:
            logger.error(f"Error starting browser: {e}")
            raise

    async def load_page(self, url: str, wait_time: int = 5, timeout: int = 30000) -> Dict[str, Any]:
        """
        Load a page and extract all content.

        Args:
            url: URL to load
            wait_time: Seconds to wait after load for dynamic content
            timeout: Page load timeout in milliseconds

        Returns:
            Dictionary with keys: url, title, text, html, screenshot
            (PNG bytes), all_text, links, images.

        Raises:
            Exception: Propagated when navigation fails.
        """
        if not self.page:
            await self.start()

        try:
            logger.info(f"Loading page: {url}")
            await self.page.goto(url, wait_until='networkidle', timeout=timeout)

            # Wait for dynamic content (JS-rendered quiz questions).
            await asyncio.sleep(wait_time)

            # Extract page content
            content = {
                'url': url,
                'title': await self.page.title(),
                'text': await self.page.inner_text('body'),
                'html': await self.page.content(),
                'screenshot': await self.page.screenshot(full_page=True),
            }

            # Walk every text node so content that inner_text may collapse
            # is still captured; falls back to the plain body text.
            try:
                content['all_text'] = await self.page.evaluate("""
                    () => {
                        const walker = document.createTreeWalker(
                            document.body,
                            NodeFilter.SHOW_TEXT,
                            null,
                            false
                        );
                        let text = [];
                        let node;
                        while (node = walker.nextNode()) {
                            if (node.textContent.trim()) {
                                text.push(node.textContent.trim());
                            }
                        }
                        return text.join('\\n');
                    }
                """)
            except Exception as e:
                logger.warning(f"Error extracting all text: {e}")
                content['all_text'] = content['text']

            # Extract links
            try:
                content['links'] = await self.page.evaluate("""
                    () => {
                        const links = Array.from(document.querySelectorAll('a[href]'));
                        return links.map(a => ({text: a.textContent.trim(), href: a.href}));
                    }
                """)
            except Exception as e:
                logger.warning(f"Error extracting links: {e}")
                content['links'] = []

            # Extract images
            try:
                content['images'] = await self.page.evaluate("""
                    () => {
                        const images = Array.from(document.querySelectorAll('img[src]'));
                        return images.map(img => ({alt: img.alt, src: img.src}));
                    }
                """)
            except Exception as e:
                logger.warning(f"Error extracting images: {e}")
                content['images'] = []

            logger.info(f"Page loaded successfully: {content['title']}")
            return content

        except Exception as e:
            logger.error(f"Error loading page {url}: {e}")
            raise

    async def click_element(self, selector: str) -> bool:
        """
        Click an element on the page.

        Args:
            selector: CSS selector

        Returns:
            True if successful, False on any error.
        """
        try:
            await self.page.click(selector)
            # Give the page a moment to react to the click.
            await asyncio.sleep(1)
            return True
        except Exception as e:
            logger.error(f"Error clicking element {selector}: {e}")
            return False

    async def fill_input(self, selector: str, value: str) -> bool:
        """
        Fill an input field.

        Args:
            selector: CSS selector
            value: Value to fill

        Returns:
            True if successful, False on any error.
        """
        try:
            await self.page.fill(selector, value)
            return True
        except Exception as e:
            logger.error(f"Error filling input {selector}: {e}")
            return False

    async def wait_for_element(self, selector: str, timeout: int = 10000) -> bool:
        """
        Wait for an element to appear.

        Args:
            selector: CSS selector
            timeout: Timeout in milliseconds

        Returns:
            True if element found, False on timeout.
        """
        try:
            await self.page.wait_for_selector(selector, timeout=timeout)
            return True
        except Exception as e:
            logger.warning(f"Element {selector} not found: {e}")
            return False

    async def evaluate_script(self, script: str) -> Any:
        """
        Execute JavaScript on the page.

        Args:
            script: JavaScript code to execute

        Returns:
            Result of script execution, or None on error.
        """
        try:
            return await self.page.evaluate(script)
        except Exception as e:
            logger.error(f"Error evaluating script: {e}")
            return None

    async def close(self) -> None:
        """Close browser and cleanup.

        BUGFIX: resource attributes are reset to None afterwards so that a
        later load_page()/start() on this instance does not reuse closed
        Playwright objects (load_page only restarts when self.page is falsy).
        """
        try:
            if self.page:
                await self.page.close()
            if self.context:
                await self.context.close()
            if self.browser:
                await self.browser.close()
            if self.playwright:
                await self.playwright.stop()
            logger.info("Browser closed")
        except Exception as e:
            logger.error(f"Error closing browser: {e}")
        finally:
            # Always forget the handles, even if close raised part-way.
            self.page = None
            self.context = None
            self.browser = None
            self.playwright = None
| 221 |
+
|
| 222 |
+
|
| 223 |
+
# Global browser instance
# Shared singleton so every request reuses one Chromium session.
_browser: Optional[BrowserHelper] = None


async def get_browser() -> BrowserHelper:
    """
    Get or create a browser instance.

    Returns:
        A started BrowserHelper instance.

    Raises:
        Exception: Propagated from BrowserHelper.start() on launch failure.
    """
    global _browser
    if _browser is None:
        # BUGFIX: only publish the helper after start() succeeds;
        # previously a failed start() left a dead, unstarted helper cached
        # in _browser, which every later call would keep returning.
        helper = BrowserHelper()
        await helper.start()
        _browser = helper
    return _browser
+
|
| 240 |
+
|
| 241 |
+
async def cleanup_browser() -> None:
    """Close and forget the shared browser instance, if one exists."""
    global _browser
    if _browser is not None:
        await _browser.close()
        _browser = None
| 247 |
+
|
app/llm.py
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LLM helper module for OpenAI GPT integration.
|
| 3 |
+
Used for reasoning, OCR, and complex question parsing.
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import logging
|
| 7 |
+
from typing import Optional, Dict, Any
|
| 8 |
+
import openai
|
| 9 |
+
from openai import AsyncOpenAI
|
| 10 |
+
import httpx
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
# Initialize OpenAI client
# Lazily populated by initialize_llm(); stays None when OPENAI_API_KEY is absent.
client: Optional[AsyncOpenAI] = None

# OpenRouter configuration
# All values are environment-driven so deployments can swap providers
# without code changes.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
OPENROUTER_BASE_URL = os.getenv("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
OPENROUTER_MODEL = os.getenv("OPENROUTER_MODEL", "gpt-5-nano")
OPENROUTER_SITE_URL = os.getenv("OPENROUTER_SITE_URL", "http://localhost")
OPENROUTER_APP_NAME = os.getenv("OPENROUTER_APP_NAME", "IITM LLM Quiz Solver")
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def initialize_llm() -> None:
    """
    Initialize the global AsyncOpenAI client from OPENAI_API_KEY.

    When no OpenAI key is configured, logs whether OpenRouter-only mode
    is available instead; the client stays None in that case.
    """
    global client
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        # No direct OpenAI access; report which fallback mode applies.
        if OPENROUTER_API_KEY:
            logger.info("OPENAI_API_KEY not set, using OpenRouter only")
        else:
            logger.warning("No OPENAI_API_KEY or OPENROUTER_API_KEY set, LLM features will be disabled")
        return
    client = AsyncOpenAI(api_key=api_key)
    logger.info("OpenAI client initialized")
| 39 |
+
|
| 40 |
+
|
| 41 |
+
async def ask_gpt(prompt: str, model: str = "gpt-4o-mini", max_tokens: int = 2000) -> Optional[str]:
    """
    Query OpenAI GPT model with a prompt.

    Falls back to OpenRouter when the OpenAI client is unavailable or
    the OpenAI call raises.

    Args:
        prompt: The prompt/question to ask
        model: Model to use (default: gpt-4o-mini)
        max_tokens: Maximum tokens in response

    Returns:
        Response text or None if error
    """
    global client

    try:
        if not client:
            logger.warning("OpenAI client not initialized, attempting OpenRouter fallback")
            return await ask_openrouter(prompt, max_tokens=max_tokens)

        completion = await client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that solves quiz questions accurately and concisely."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            temperature=0.3
        )
        answer_text = completion.choices[0].message.content
        logger.info(f"GPT response received (model: {model})")
        return answer_text

    except Exception as e:
        logger.error(f"Error calling OpenAI API: {e}")
        # Best-effort fallback to OpenRouter if configured.
        fallback = await ask_openrouter(prompt, max_tokens=max_tokens)
        return fallback if fallback else None
| 81 |
+
|
| 82 |
+
|
| 83 |
+
async def ask_openrouter(prompt: str, model: Optional[str] = None, max_tokens: int = 2000) -> Optional[str]:
    """
    Query OpenRouter's chat-completions endpoint (e.g. GPT-5-nano).

    Args:
        prompt: Prompt text
        model: Model to use; falls back to OPENROUTER_MODEL when omitted
        max_tokens: Maximum tokens

    Returns:
        Response text, or None when the key is missing or the call fails.
    """
    if not OPENROUTER_API_KEY:
        logger.warning("OPENROUTER_API_KEY not set, cannot call OpenRouter")
        return None

    chosen_model = model or OPENROUTER_MODEL

    endpoint = f"{OPENROUTER_BASE_URL.rstrip('/')}/chat/completions"
    # HTTP-Referer / X-Title identify the app to OpenRouter's dashboard.
    request_headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "HTTP-Referer": OPENROUTER_SITE_URL,
        "X-Title": OPENROUTER_APP_NAME,
        "Content-Type": "application/json",
    }
    body = {
        "model": chosen_model,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant that solves quiz questions accurately and concisely."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens,
        "temperature": 0.2
    }

    try:
        async with httpx.AsyncClient(timeout=60) as session:
            reply = await session.post(endpoint, headers=request_headers, json=body)
            reply.raise_for_status()
            answer = reply.json()["choices"][0]["message"]["content"]
            logger.info(f"OpenRouter response received (model: {chosen_model})")
            return answer
    except Exception as e:
        logger.error(f"Error calling OpenRouter API: {e}")
        return None
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
async def parse_question_with_llm(question_text: str, context: str = "") -> Optional[Dict[str, Any]]:
    """
    Use LLM to parse and understand a quiz question.

    Args:
        question_text: The question text
        context: Additional context from the page

    Returns:
        Parsed question structure (dict) on success, a
        {"raw_response": ...} wrapper when the reply is not valid JSON,
        or None when the LLM call itself fails.
    """
    import json
    import re

    prompt = f"""Analyze this quiz question and provide a structured response:

Question: {question_text}

Context: {context}

Please identify:
1. What type of question is this? (scraping, calculation, API call, data analysis, etc.)
2. What data or resources are needed?
3. What is the expected answer format? (JSON, number, text, etc.)

Respond in JSON format:
{{
    "type": "question_type",
    "requirements": ["requirement1", "requirement2"],
    "answer_format": "format_type",
    "reasoning": "your reasoning"
}}
"""

    response = await ask_gpt(prompt)
    if not response:
        return None

    # BUGFIX/robustness: first try the whole reply as JSON — this handles
    # arbitrarily nested objects. The regex fallback below only matches one
    # level of brace nesting, so deeply nested replies used to be truncated
    # or rejected.
    try:
        parsed = json.loads(response.strip())
        if isinstance(parsed, dict):
            return parsed
    except json.JSONDecodeError:
        pass

    # Fallback: pull the first {...} span (single-level nesting) out of a
    # reply that wraps the JSON in prose or code fences.
    json_match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', response, re.DOTALL)
    if json_match:
        try:
            return json.loads(json_match.group())
        except json.JSONDecodeError:
            pass

    return {"raw_response": response}
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
async def solve_with_llm(question: str, available_data: Dict[str, Any]) -> Optional[str]:
    """
    Use LLM to solve a quiz question.

    Args:
        question: The question text
        available_data: Any data extracted from the page
            (embedded in the prompt via its str()/repr form)

    Returns:
        Answer or None
    """
    # NOTE(review): the dict is interpolated with its repr; large page dumps
    # will inflate the prompt — trim upstream if token limits are hit.
    prompt = f"""Solve this quiz question:

Question: {question}

Available Data:
{available_data}

Provide a clear, concise answer. If the answer should be in JSON format, provide valid JSON.
If it's a calculation, show your work briefly.
"""

    # Larger max_tokens than the default: answers may include worked steps.
    return await ask_gpt(prompt, max_tokens=3000)
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
async def ocr_image_with_llm(image_base64: str) -> Optional[str]:
    """
    Extract text from a base64-encoded image using a vision-capable GPT model.

    Tries each known vision model in order and returns the first
    successful transcription.

    Note: Requires GPT-4 Vision model (gpt-4o or gpt-4-vision-preview).
    gpt-4o-mini does not support vision.

    Args:
        image_base64: Base64 encoded image

    Returns:
        Extracted text, or None when no vision-capable model is usable
        or the client is not initialized.
    """
    global client

    if not client:
        return None

    # Try vision-capable models, preferred first.
    for model in ("gpt-4o", "gpt-4-vision-preview"):
        request_content = [
            {"type": "text", "text": "Extract all text from this image. Return only the text content."},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
        ]
        try:
            response = await client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": request_content}],
                max_tokens=1000
            )
        except Exception as e:
            # Model unavailable on this account/plan — try the next one.
            logger.warning(f"Error with model {model}: {e}")
            continue
        return response.choices[0].message.content

    logger.error("No vision-capable model available")
    return None
|
| 251 |
+
|
app/main.py
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI main server for IITM LLM Quiz Solver.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import logging
|
| 6 |
+
import asyncio
|
| 7 |
+
from typing import Dict, Any, Optional
|
| 8 |
+
from fastapi import FastAPI, HTTPException, Request
|
| 9 |
+
from fastapi.responses import JSONResponse
|
| 10 |
+
from pydantic import BaseModel, Field, field_validator
|
| 11 |
+
import uvicorn
|
| 12 |
+
|
| 13 |
+
# Try to load .env file if python-dotenv is available
|
| 14 |
+
try:
|
| 15 |
+
from dotenv import load_dotenv
|
| 16 |
+
load_dotenv()
|
| 17 |
+
except ImportError:
|
| 18 |
+
pass # python-dotenv is optional
|
| 19 |
+
|
| 20 |
+
from app.solver import solve_quiz
|
| 21 |
+
from app.utils import validate_secret
|
| 22 |
+
from app.browser import cleanup_browser
|
| 23 |
+
|
| 24 |
+
# Configure logging
# Single root-level config: timestamped, logger-name-tagged records at INFO.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Get secret from environment
# NOTE(review): the fallback value means requests authenticate against a
# known default when QUIZ_SECRET is unset — confirm deployments always set it.
EXPECTED_SECRET = os.getenv("QUIZ_SECRET", "default_secret_change_me")
|
| 33 |
+
|
| 34 |
+
# Lifespan context manager for startup and shutdown
|
| 35 |
+
from contextlib import asynccontextmanager
|
| 36 |
+
|
| 37 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context manager for startup and shutdown.

    Code before the yield runs once at startup; code after it runs at
    shutdown and releases the shared Playwright browser.
    """
    # Startup
    logger.info("Application starting up...")
    yield
    # Shutdown
    logger.info("Shutting down, cleaning up browser...")
    await cleanup_browser()


# Initialize FastAPI app with lifespan
app = FastAPI(
    title="IITM LLM Quiz Solver",
    description="API endpoint to automatically solve dynamic quiz tasks",
    version="1.0.0",
    lifespan=lifespan
)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class QuizRequest(BaseModel):
    """Payload accepted by the /solve and /demo endpoints."""
    email: str = Field(..., description="User email address")
    secret: str = Field(..., description="Secret key for authentication")
    url: str = Field(..., description="Quiz page URL")

    @field_validator('email')
    @classmethod
    def validate_email(cls, v):
        # Minimal sanity check: non-empty and contains an '@'.
        if v and '@' in v:
            return v
        raise ValueError('Invalid email format')

    @field_validator('url')
    @classmethod
    def validate_url(cls, v):
        # Only absolute http(s) URLs are accepted.
        if v and v.startswith(('http://', 'https://')):
            return v
        raise ValueError('Invalid URL format')
| 75 |
+
|
| 76 |
+
|
| 77 |
+
@app.get("/")
|
| 78 |
+
async def root():
|
| 79 |
+
"""Root endpoint."""
|
| 80 |
+
return {
|
| 81 |
+
"message": "IITM LLM Quiz Solver API",
|
| 82 |
+
"version": "1.0.0",
|
| 83 |
+
"endpoints": {
|
| 84 |
+
"/solve": "POST - Solve a quiz",
|
| 85 |
+
"/health": "GET - Health check",
|
| 86 |
+
"/demo": "POST - Demo endpoint"
|
| 87 |
+
}
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
@app.get("/health")
|
| 92 |
+
async def health_check():
|
| 93 |
+
"""Health check endpoint."""
|
| 94 |
+
return {"status": "healthy"}
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
@app.get("/env-check")
|
| 98 |
+
async def env_check():
|
| 99 |
+
"""
|
| 100 |
+
Check environment variables status (returns JSON).
|
| 101 |
+
Useful for verifying configuration.
|
| 102 |
+
"""
|
| 103 |
+
quiz_secret = os.getenv("QUIZ_SECRET")
|
| 104 |
+
openai_key = os.getenv("OPENAI_API_KEY")
|
| 105 |
+
openrouter_key = os.getenv("OPENROUTER_API_KEY")
|
| 106 |
+
port = os.getenv("PORT", "8000")
|
| 107 |
+
|
| 108 |
+
return {
|
| 109 |
+
"status": "ok",
|
| 110 |
+
"variables": {
|
| 111 |
+
"QUIZ_SECRET": {
|
| 112 |
+
"set": quiz_secret is not None,
|
| 113 |
+
"length": len(quiz_secret) if quiz_secret else 0,
|
| 114 |
+
"preview": f"{quiz_secret[:4]}...{quiz_secret[-4:]}" if quiz_secret and len(quiz_secret) > 8 else "***" if quiz_secret else None
|
| 115 |
+
},
|
| 116 |
+
"OPENAI_API_KEY": {
|
| 117 |
+
"set": openai_key is not None,
|
| 118 |
+
"length": len(openai_key) if openai_key else 0,
|
| 119 |
+
"preview": f"{openai_key[:7]}...{openai_key[-4:]}" if openai_key and len(openai_key) > 11 else "***" if openai_key else None,
|
| 120 |
+
"valid_format": openai_key.startswith("sk-") if openai_key else False
|
| 121 |
+
},
|
| 122 |
+
"OPENROUTER_API_KEY": {
|
| 123 |
+
"set": openrouter_key is not None,
|
| 124 |
+
"length": len(openrouter_key) if openrouter_key else 0,
|
| 125 |
+
"preview": f"{openrouter_key[:7]}...{openrouter_key[-4:]}" if openrouter_key and len(openrouter_key) > 11 else "***" if openrouter_key else None,
|
| 126 |
+
"valid_format": openrouter_key.startswith("sk-or-") if openrouter_key else False
|
| 127 |
+
},
|
| 128 |
+
"PORT": {
|
| 129 |
+
"set": True,
|
| 130 |
+
"value": port
|
| 131 |
+
}
|
| 132 |
+
},
|
| 133 |
+
"ready": quiz_secret is not None,
|
| 134 |
+
"llm_enabled": any([openai_key, openrouter_key])
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
@app.post("/solve")
|
| 139 |
+
async def solve_quiz_endpoint(request: QuizRequest):
|
| 140 |
+
"""
|
| 141 |
+
Main endpoint to solve a quiz.
|
| 142 |
+
|
| 143 |
+
Validates secret and solves the quiz recursively.
|
| 144 |
+
"""
|
| 145 |
+
try:
|
| 146 |
+
# Validate secret
|
| 147 |
+
if not validate_secret(request.secret, EXPECTED_SECRET):
|
| 148 |
+
logger.warning(f"Invalid secret provided for email: {request.email}")
|
| 149 |
+
raise HTTPException(
|
| 150 |
+
status_code=403,
|
| 151 |
+
detail={"error": "forbidden"}
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
logger.info(f"Solving quiz for {request.email} at {request.url}")
|
| 155 |
+
|
| 156 |
+
# Solve quiz with timeout
|
| 157 |
+
try:
|
| 158 |
+
result = await asyncio.wait_for(
|
| 159 |
+
solve_quiz(request.url, request.email, request.secret),
|
| 160 |
+
timeout=180.0 # 3 minutes
|
| 161 |
+
)
|
| 162 |
+
return result
|
| 163 |
+
except asyncio.TimeoutError:
|
| 164 |
+
logger.error("Quiz solving timed out")
|
| 165 |
+
raise HTTPException(
|
| 166 |
+
status_code=504,
|
| 167 |
+
detail={"error": "Request timeout - quiz solving took too long"}
|
| 168 |
+
)
|
| 169 |
+
except Exception as e:
|
| 170 |
+
logger.error(f"Error solving quiz: {e}", exc_info=True)
|
| 171 |
+
raise HTTPException(
|
| 172 |
+
status_code=500,
|
| 173 |
+
detail={"error": str(e)}
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
except HTTPException:
|
| 177 |
+
raise
|
| 178 |
+
except ValueError as e:
|
| 179 |
+
logger.error(f"Validation error: {e}")
|
| 180 |
+
raise HTTPException(
|
| 181 |
+
status_code=400,
|
| 182 |
+
detail={"error": "Invalid request format", "message": str(e)}
|
| 183 |
+
)
|
| 184 |
+
except Exception as e:
|
| 185 |
+
logger.error(f"Unexpected error: {e}", exc_info=True)
|
| 186 |
+
raise HTTPException(
|
| 187 |
+
status_code=500,
|
| 188 |
+
detail={"error": "Internal server error", "message": str(e)}
|
| 189 |
+
)
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
@app.post("/demo")
|
| 193 |
+
async def demo_endpoint(request: QuizRequest):
|
| 194 |
+
"""
|
| 195 |
+
Demo endpoint for testing.
|
| 196 |
+
|
| 197 |
+
Same as /solve but with more lenient error handling.
|
| 198 |
+
"""
|
| 199 |
+
try:
|
| 200 |
+
# Validate secret (can be more lenient for demo)
|
| 201 |
+
if not validate_secret(request.secret, EXPECTED_SECRET):
|
| 202 |
+
logger.warning(f"Invalid secret in demo request")
|
| 203 |
+
return JSONResponse(
|
| 204 |
+
status_code=403,
|
| 205 |
+
content={"error": "forbidden"}
|
| 206 |
+
)
|
| 207 |
+
|
| 208 |
+
logger.info(f"Demo: Solving quiz for {request.email} at {request.url}")
|
| 209 |
+
|
| 210 |
+
# Solve quiz
|
| 211 |
+
try:
|
| 212 |
+
result = await asyncio.wait_for(
|
| 213 |
+
solve_quiz(request.url, request.email, request.secret),
|
| 214 |
+
timeout=180.0
|
| 215 |
+
)
|
| 216 |
+
return result
|
| 217 |
+
except asyncio.TimeoutError:
|
| 218 |
+
return JSONResponse(
|
| 219 |
+
status_code=504,
|
| 220 |
+
content={"error": "Request timeout"}
|
| 221 |
+
)
|
| 222 |
+
except Exception as e:
|
| 223 |
+
logger.error(f"Error in demo: {e}", exc_info=True)
|
| 224 |
+
return JSONResponse(
|
| 225 |
+
status_code=500,
|
| 226 |
+
content={"error": str(e)}
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
+
except ValueError as e:
|
| 230 |
+
return JSONResponse(
|
| 231 |
+
status_code=400,
|
| 232 |
+
content={"error": "Invalid request format", "message": str(e)}
|
| 233 |
+
)
|
| 234 |
+
except Exception as e:
|
| 235 |
+
logger.error(f"Unexpected error in demo: {e}", exc_info=True)
|
| 236 |
+
return JSONResponse(
|
| 237 |
+
status_code=500,
|
| 238 |
+
content={"error": "Internal server error", "message": str(e)}
|
| 239 |
+
)
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
if __name__ == "__main__":
    # Honour the platform-provided PORT, defaulting to 8000 locally.
    serve_port = int(os.getenv("PORT", 8000))
    uvicorn.run("app.main:app", host="0.0.0.0", port=serve_port, log_level="info")
|
| 250 |
+
|
app/solver.py
ADDED
|
@@ -0,0 +1,593 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Quiz solver module - main logic for solving quizzes.
|
| 3 |
+
"""
|
| 4 |
+
import asyncio
|
| 5 |
+
import json
|
| 6 |
+
import logging
|
| 7 |
+
import re
|
| 8 |
+
from typing import Optional, Dict, Any, List
|
| 9 |
+
import requests
|
| 10 |
+
from bs4 import BeautifulSoup
|
| 11 |
+
import pandas as pd
|
| 12 |
+
import io
|
| 13 |
+
import base64
|
| 14 |
+
|
| 15 |
+
from app.browser import get_browser, cleanup_browser
|
| 16 |
+
from app.llm import ask_gpt, parse_question_with_llm, solve_with_llm, initialize_llm
|
| 17 |
+
from app.utils import extract_submit_url, clean_text, extract_json_from_text, is_valid_url
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
# Initialize LLM on module load
|
| 22 |
+
initialize_llm()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class QuizSolver:
|
| 26 |
+
"""Main quiz solver class."""
|
| 27 |
+
|
| 28 |
+
    def __init__(self):
        """Initialize solver state for a fresh run."""
        # Browser wrapper; acquired lazily in solve_quiz() via get_browser().
        self.browser = None
        # Hard cap on how many chained quizzes _solve_recursive() will follow.
        self.max_recursion = 10
        # Number of quizzes attempted so far in the current run.
        self.current_recursion = 0
|
| 32 |
+
|
| 33 |
+
    async def solve_quiz(self, url: str, email: str, secret: str) -> Dict[str, Any]:
        """
        Main entry point for solving a quiz.

        Resets the recursion counter, acquires the (shared) browser instance,
        then delegates to _solve_recursive, which follows quiz chains.

        Args:
            url: Quiz page URL
            email: User email
            secret: Secret key

        Returns:
            Final response from quiz system
        """
        # Fresh run: restart the chained-quiz counter.
        self.current_recursion = 0
        # get_browser() appears to return a long-lived/shared browser
        # wrapper — see the deliberate non-close in the finally below.
        self.browser = await get_browser()

        try:
            return await self._solve_recursive(url, email, secret)
        finally:
            # Don't close browser here as it might be reused
            pass
|
| 53 |
+
|
| 54 |
+
    async def _solve_recursive(self, url: str, email: str, secret: str) -> Dict[str, Any]:
        """
        Recursively solve quizzes: solve one page, submit, and follow the
        'url' key of the response to the next quiz (up to max_recursion).

        Args:
            url: Current quiz URL
            email: User email
            secret: Secret key

        Returns:
            Response from quiz system (or an {"error": ...} dict on failure)
        """
        # Depth guard: stop following quiz chains past the configured cap.
        if self.current_recursion >= self.max_recursion:
            logger.error("Maximum recursion depth reached")
            return {"error": "Maximum recursion depth reached"}

        self.current_recursion += 1
        logger.info(f"Solving quiz {self.current_recursion}: {url}")

        try:
            # Load the quiz page (wait_time gives JS-rendered pages a chance).
            page_content = await self.browser.load_page(url, wait_time=3)

            # Extract submit URL from the rendered text first...
            submit_url = extract_submit_url(page_content['text'], url)
            if not submit_url:
                # ...then fall back to text extracted from the raw HTML.
                soup = BeautifulSoup(page_content['html'], 'html.parser')
                submit_url = extract_submit_url(soup.get_text(), url)

            if not submit_url:
                logger.error("Could not find submit URL")
                return {"error": "Submit URL not found"}

            # Extract question and solve
            question_text = self._extract_question(page_content)
            logger.info(f"Question extracted: {question_text[:200]}...")

            # Solve the question
            answer = await self._solve_question(question_text, page_content)

            # Ensure answer is in the correct format (string or simple JSON-serializable)
            answer = self._normalize_answer(answer)
            logger.info(f"Answer computed: {str(answer)[:200]}...")

            # Submit answer
            response = await self._submit_answer(
                submit_url, email, secret, url, answer
            )

            # Check if there's a next quiz: responses may carry a 'url' key
            # pointing at the next page in the chain.
            if isinstance(response, dict) and 'url' in response:
                next_url = response['url']
                # Guard against loops (same URL) and malformed links.
                if next_url and next_url != url and is_valid_url(next_url):
                    logger.info(f"Next quiz found: {next_url}")
                    # Recursively solve next quiz
                    next_response = await self._solve_recursive(next_url, email, secret)
                    return next_response

            return response

        except Exception as e:
            # Boundary handler: any failure is reported as a result dict so
            # the caller never sees an exception from this coroutine.
            logger.error(f"Error solving quiz: {e}", exc_info=True)
            return {"error": str(e)}
|
| 118 |
+
|
| 119 |
+
def _extract_question(self, page_content: Dict[str, Any]) -> str:
|
| 120 |
+
"""
|
| 121 |
+
Extract question text from page content.
|
| 122 |
+
|
| 123 |
+
Args:
|
| 124 |
+
page_content: Page content dictionary
|
| 125 |
+
|
| 126 |
+
Returns:
|
| 127 |
+
Question text
|
| 128 |
+
"""
|
| 129 |
+
text = page_content.get('all_text', page_content.get('text', ''))
|
| 130 |
+
|
| 131 |
+
# Try to find question markers
|
| 132 |
+
question_patterns = [
|
| 133 |
+
r'[Qq]uestion[:\s]+(.*?)(?:\n\n|\n[A-Z]|$)',
|
| 134 |
+
r'[Pp]roblem[:\s]+(.*?)(?:\n\n|\n[A-Z]|$)',
|
| 135 |
+
r'[Tt]ask[:\s]+(.*?)(?:\n\n|\n[A-Z]|$)',
|
| 136 |
+
]
|
| 137 |
+
|
| 138 |
+
for pattern in question_patterns:
|
| 139 |
+
match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
|
| 140 |
+
if match:
|
| 141 |
+
return clean_text(match.group(1))
|
| 142 |
+
|
| 143 |
+
# If no pattern matches, return first substantial paragraph
|
| 144 |
+
paragraphs = [p.strip() for p in text.split('\n\n') if len(p.strip()) > 50]
|
| 145 |
+
if paragraphs:
|
| 146 |
+
return paragraphs[0]
|
| 147 |
+
|
| 148 |
+
return clean_text(text[:1000]) # Return first 1000 chars
|
| 149 |
+
|
| 150 |
+
async def _solve_question(self, question: str, page_content: Dict[str, Any]) -> Any:
|
| 151 |
+
"""
|
| 152 |
+
Solve a quiz question using various strategies.
|
| 153 |
+
|
| 154 |
+
Args:
|
| 155 |
+
question: Question text
|
| 156 |
+
page_content: Full page content
|
| 157 |
+
|
| 158 |
+
Returns:
|
| 159 |
+
Answer (can be dict, list, string, number, etc.)
|
| 160 |
+
"""
|
| 161 |
+
logger.info("Analyzing question type...")
|
| 162 |
+
|
| 163 |
+
# Try to parse question with LLM first
|
| 164 |
+
parsed = await parse_question_with_llm(question, page_content.get('text', ''))
|
| 165 |
+
|
| 166 |
+
# Extract data from page
|
| 167 |
+
available_data = self._extract_data_from_page(page_content)
|
| 168 |
+
|
| 169 |
+
# Strategy 1: Check if answer is already in the page
|
| 170 |
+
answer_in_page = self._find_answer_in_page(page_content, question)
|
| 171 |
+
if answer_in_page:
|
| 172 |
+
logger.info("Answer found in page content")
|
| 173 |
+
return answer_in_page
|
| 174 |
+
|
| 175 |
+
# Strategy 2: Check for data files/links to download
|
| 176 |
+
data_files = self._find_data_files(page_content)
|
| 177 |
+
if data_files:
|
| 178 |
+
logger.info(f"Found data files: {data_files}")
|
| 179 |
+
processed_data = await self._process_data_files(data_files)
|
| 180 |
+
if processed_data:
|
| 181 |
+
answer = await self._solve_with_data(question, processed_data)
|
| 182 |
+
if answer:
|
| 183 |
+
return answer
|
| 184 |
+
|
| 185 |
+
# Strategy 3: Use LLM to solve
|
| 186 |
+
logger.info("Attempting to solve with LLM...")
|
| 187 |
+
llm_answer = await solve_with_llm(question, available_data)
|
| 188 |
+
if llm_answer:
|
| 189 |
+
# Try to parse as JSON if it looks like JSON
|
| 190 |
+
json_answer = extract_json_from_text(llm_answer)
|
| 191 |
+
if json_answer:
|
| 192 |
+
return json_answer
|
| 193 |
+
return llm_answer
|
| 194 |
+
|
| 195 |
+
# Strategy 4: Fallback - try to extract a simple answer from the question
|
| 196 |
+
# Many quiz pages have the answer in the question itself
|
| 197 |
+
simple_answer = self._extract_simple_answer(question, page_content)
|
| 198 |
+
if simple_answer:
|
| 199 |
+
logger.info("Extracted simple answer from question")
|
| 200 |
+
return simple_answer
|
| 201 |
+
|
| 202 |
+
# Strategy 5: Last resort - return a default answer
|
| 203 |
+
logger.warning("Could not solve question, using default answer")
|
| 204 |
+
return "answer"
|
| 205 |
+
|
| 206 |
+
def _extract_data_from_page(self, page_content: Dict[str, Any]) -> Dict[str, Any]:
|
| 207 |
+
"""
|
| 208 |
+
Extract structured data from page.
|
| 209 |
+
|
| 210 |
+
Args:
|
| 211 |
+
page_content: Page content dictionary
|
| 212 |
+
|
| 213 |
+
Returns:
|
| 214 |
+
Dictionary of extracted data
|
| 215 |
+
"""
|
| 216 |
+
data = {
|
| 217 |
+
'text': page_content.get('text', ''),
|
| 218 |
+
'html': page_content.get('html', ''),
|
| 219 |
+
'links': page_content.get('links', []),
|
| 220 |
+
'images': page_content.get('images', []),
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
# Try to extract tables
|
| 224 |
+
try:
|
| 225 |
+
soup = BeautifulSoup(page_content.get('html', ''), 'html.parser')
|
| 226 |
+
tables = soup.find_all('table')
|
| 227 |
+
if tables:
|
| 228 |
+
data['tables'] = []
|
| 229 |
+
for table in tables:
|
| 230 |
+
try:
|
| 231 |
+
df = pd.read_html(str(table))[0]
|
| 232 |
+
data['tables'].append(df.to_dict('records'))
|
| 233 |
+
except:
|
| 234 |
+
pass
|
| 235 |
+
except Exception as e:
|
| 236 |
+
logger.warning(f"Error extracting tables: {e}")
|
| 237 |
+
|
| 238 |
+
# Try to extract JSON from page
|
| 239 |
+
json_data = extract_json_from_text(page_content.get('text', ''))
|
| 240 |
+
if json_data:
|
| 241 |
+
data['json'] = json_data
|
| 242 |
+
|
| 243 |
+
return data
|
| 244 |
+
|
| 245 |
+
def _find_answer_in_page(self, page_content: Dict[str, Any], question: str) -> Optional[Any]:
|
| 246 |
+
"""
|
| 247 |
+
Check if answer is already present in page content.
|
| 248 |
+
|
| 249 |
+
Args:
|
| 250 |
+
page_content: Page content
|
| 251 |
+
question: Question text
|
| 252 |
+
|
| 253 |
+
Returns:
|
| 254 |
+
Answer if found, None otherwise
|
| 255 |
+
"""
|
| 256 |
+
text = page_content.get('all_text', page_content.get('text', ''))
|
| 257 |
+
|
| 258 |
+
# Look for answer patterns
|
| 259 |
+
answer_patterns = [
|
| 260 |
+
r'[Aa]nswer[:\s]+(.*?)(?:\n\n|$)',
|
| 261 |
+
r'[Ss]olution[:\s]+(.*?)(?:\n\n|$)',
|
| 262 |
+
r'[Rr]esult[:\s]+(.*?)(?:\n\n|$)',
|
| 263 |
+
]
|
| 264 |
+
|
| 265 |
+
for pattern in answer_patterns:
|
| 266 |
+
match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
|
| 267 |
+
if match:
|
| 268 |
+
answer_text = clean_text(match.group(1))
|
| 269 |
+
# Try to parse as JSON
|
| 270 |
+
json_answer = extract_json_from_text(answer_text)
|
| 271 |
+
if json_answer:
|
| 272 |
+
return json_answer
|
| 273 |
+
return answer_text
|
| 274 |
+
|
| 275 |
+
return None
|
| 276 |
+
|
| 277 |
+
def _find_data_files(self, page_content: Dict[str, Any]) -> List[str]:
|
| 278 |
+
"""
|
| 279 |
+
Find data files (CSV, JSON, PDF, etc.) linked in the page.
|
| 280 |
+
|
| 281 |
+
Args:
|
| 282 |
+
page_content: Page content
|
| 283 |
+
|
| 284 |
+
Returns:
|
| 285 |
+
List of file URLs
|
| 286 |
+
"""
|
| 287 |
+
files = []
|
| 288 |
+
|
| 289 |
+
# Check links
|
| 290 |
+
for link in page_content.get('links', []):
|
| 291 |
+
href = link.get('href', '')
|
| 292 |
+
if any(href.lower().endswith(ext) for ext in ['.csv', '.json', '.pdf', '.xlsx', '.txt']):
|
| 293 |
+
files.append(href)
|
| 294 |
+
|
| 295 |
+
# Check text for file URLs
|
| 296 |
+
text = page_content.get('text', '')
|
| 297 |
+
file_pattern = r'https?://[^\s<>"\'\)]+\.(csv|json|pdf|xlsx|txt)'
|
| 298 |
+
matches = re.findall(file_pattern, text, re.IGNORECASE)
|
| 299 |
+
files.extend([m[0] for m in matches if m[0] not in files])
|
| 300 |
+
|
| 301 |
+
return files
|
| 302 |
+
|
| 303 |
+
    async def _process_data_files(self, file_urls: List[str]) -> Dict[str, Any]:
        """
        Download and process data files into Python objects.

        CSV -> list of row dicts, JSON -> parsed object, PDF -> extracted
        text (pdfplumber preferred, PyPDF2 fallback), TXT -> raw text.
        Failures on individual files are logged and skipped.

        Args:
            file_urls: List of file URLs

        Returns:
            Dictionary keyed by filename (last URL path segment) with the
            processed content of each successfully handled file.
        """
        processed = {}

        for url in file_urls:
            try:
                logger.info(f"Downloading file: {url}")
                # Synchronous HTTP inside an async method — blocks the event
                # loop for up to 30s per file; presumably acceptable here.
                response = requests.get(url, timeout=30)
                response.raise_for_status()

                content_type = response.headers.get('content-type', '').lower()
                # NOTE: filenames are not unique across URLs; a later file
                # with the same basename overwrites an earlier one.
                filename = url.split('/')[-1]

                if 'csv' in content_type or filename.endswith('.csv'):
                    df = pd.read_csv(io.StringIO(response.text))
                    processed[filename] = df.to_dict('records')

                elif 'json' in content_type or filename.endswith('.json'):
                    processed[filename] = response.json()

                elif 'pdf' in content_type or filename.endswith('.pdf'):
                    # PDF processing - try pdfplumber first, then PyPDF2
                    text = None

                    # Try pdfplumber
                    try:
                        import pdfplumber
                        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
                            text = ""
                            for page in pdf.pages:
                                page_text = page.extract_text()
                                if page_text:
                                    text += page_text + "\n"
                        if text:
                            processed[filename] = text.strip()
                    except ImportError:
                        logger.debug("pdfplumber not available")
                    except Exception as e:
                        logger.warning(f"Error reading PDF with pdfplumber (unknown): {e}")

                    # Fallback to PyPDF2 when pdfplumber failed or yielded nothing
                    if not text or filename not in processed:
                        try:
                            import PyPDF2
                            pdf_file = io.BytesIO(response.content)
                            pdf_reader = PyPDF2.PdfReader(pdf_file)
                            text = ""
                            for page in pdf_reader.pages:
                                page_text = page.extract_text()
                                if page_text:
                                    text += page_text + "\n"
                            if text:
                                processed[filename] = text.strip()
                        except ImportError:
                            logger.warning("Neither pdfplumber nor PyPDF2 available for PDF processing")
                        except Exception as e:
                            logger.warning(f"Error reading PDF with PyPDF2 (unknown): {e}")

                elif filename.endswith('.txt'):
                    processed[filename] = response.text

            except Exception as e:
                # Best-effort: a bad file must not abort the whole batch.
                logger.error(f"Error processing file {url}: {e}")
                continue

        return processed
|
| 377 |
+
|
| 378 |
+
def _normalize_answer(self, answer: Any) -> Any:
|
| 379 |
+
"""
|
| 380 |
+
Normalize answer to ensure it's JSON-serializable and in correct format.
|
| 381 |
+
|
| 382 |
+
Args:
|
| 383 |
+
answer: Raw answer (can be dict, list, string, etc.)
|
| 384 |
+
|
| 385 |
+
Returns:
|
| 386 |
+
Normalized answer (preferably string or simple JSON)
|
| 387 |
+
"""
|
| 388 |
+
if answer is None:
|
| 389 |
+
return "answer"
|
| 390 |
+
|
| 391 |
+
# If it's a dict with question/analysis, extract a simple answer
|
| 392 |
+
if isinstance(answer, dict):
|
| 393 |
+
# If it contains an 'answer' key, use that
|
| 394 |
+
if 'answer' in answer:
|
| 395 |
+
return self._normalize_answer(answer['answer'])
|
| 396 |
+
# If it's an analysis dict, try to extract something useful
|
| 397 |
+
if 'question' in answer and len(answer) > 1:
|
| 398 |
+
# Return a simple string instead of the whole dict
|
| 399 |
+
return "answer"
|
| 400 |
+
# If it's a simple dict, convert to JSON string
|
| 401 |
+
if len(answer) <= 3:
|
| 402 |
+
try:
|
| 403 |
+
return json.dumps(answer)
|
| 404 |
+
except:
|
| 405 |
+
return str(answer)
|
| 406 |
+
# Complex dict - return as JSON string
|
| 407 |
+
try:
|
| 408 |
+
return json.dumps(answer)
|
| 409 |
+
except:
|
| 410 |
+
return str(answer)
|
| 411 |
+
|
| 412 |
+
# If it's a list, convert to JSON string if small, otherwise string
|
| 413 |
+
if isinstance(answer, list):
|
| 414 |
+
if len(answer) <= 10:
|
| 415 |
+
try:
|
| 416 |
+
return json.dumps(answer)
|
| 417 |
+
except:
|
| 418 |
+
return str(answer)
|
| 419 |
+
return str(answer)
|
| 420 |
+
|
| 421 |
+
# For strings, return as-is (but clean up)
|
| 422 |
+
if isinstance(answer, str):
|
| 423 |
+
# Remove excessive whitespace
|
| 424 |
+
answer = ' '.join(answer.split())
|
| 425 |
+
# If it's very long, truncate
|
| 426 |
+
if len(answer) > 1000:
|
| 427 |
+
answer = answer[:1000] + "..."
|
| 428 |
+
return answer
|
| 429 |
+
|
| 430 |
+
# For other types, convert to string
|
| 431 |
+
return str(answer)
|
| 432 |
+
|
| 433 |
+
def _extract_simple_answer(self, question: str, page_content: Dict[str, Any]) -> Optional[str]:
|
| 434 |
+
"""
|
| 435 |
+
Try to extract a simple answer from the question or page.
|
| 436 |
+
|
| 437 |
+
Args:
|
| 438 |
+
question: Question text
|
| 439 |
+
page_content: Page content
|
| 440 |
+
|
| 441 |
+
Returns:
|
| 442 |
+
Simple answer string or None
|
| 443 |
+
"""
|
| 444 |
+
text = page_content.get('all_text', page_content.get('text', ''))
|
| 445 |
+
combined = question + "\n\n" + text
|
| 446 |
+
|
| 447 |
+
# Check if question says "anything" or similar - very common in demo quizzes
|
| 448 |
+
if re.search(r'"answer"\s*:\s*"anything\s+you\s+want"', combined, re.IGNORECASE):
|
| 449 |
+
return "answer"
|
| 450 |
+
if re.search(r'"answer"\s*:\s*"anything"', combined, re.IGNORECASE):
|
| 451 |
+
return "answer"
|
| 452 |
+
if re.search(r'anything\s+you\s+want|any\s+value|any\s+string|any\s+text|anything', question, re.IGNORECASE):
|
| 453 |
+
return "answer"
|
| 454 |
+
|
| 455 |
+
# Look for patterns like "answer: X" or "the answer is X"
|
| 456 |
+
patterns = [
|
| 457 |
+
r'"answer"\s*:\s*"([^"]+)"', # JSON format: "answer": "value"
|
| 458 |
+
r'[Aa]nswer[:\s]+["\']?([^"\'\n]+)["\']?',
|
| 459 |
+
r'[Tt]he\s+[Aa]nswer\s+[Ii]s[:\s]+["\']?([^"\'\n]+)["\']?',
|
| 460 |
+
r'[Yy]our\s+[Aa]nswer[:\s]+["\']?([^"\'\n]+)["\']?',
|
| 461 |
+
]
|
| 462 |
+
|
| 463 |
+
for pattern in patterns:
|
| 464 |
+
match = re.search(pattern, combined, re.IGNORECASE)
|
| 465 |
+
if match:
|
| 466 |
+
answer = match.group(1).strip()
|
| 467 |
+
# Skip if it's a placeholder or instruction
|
| 468 |
+
if answer and len(answer) < 200 and answer.lower() not in ['your email', 'your secret', 'anything you want', 'anything']:
|
| 469 |
+
return answer
|
| 470 |
+
|
| 471 |
+
return None
|
| 472 |
+
|
| 473 |
+
    async def _solve_with_data(self, question: str, data: Dict[str, Any]) -> Optional[Any]:
        """
        Solve a question using processed (downloaded) data via the LLM.

        Args:
            question: Question text
            data: Processed data dictionary (filename -> content)

        Returns:
            Parsed JSON answer when the LLM reply contains JSON, the raw
            reply string otherwise, or None when the LLM returned nothing.
        """
        # Use LLM to solve with data.  default=str keeps json.dumps from
        # failing on non-serializable values (e.g. numpy/pandas scalars).
        prompt = f"""Solve this question using the provided data:

Question: {question}

Data:
{json.dumps(data, indent=2, default=str)}

Provide the answer. If JSON format is required, return valid JSON.
"""

        answer = await ask_gpt(prompt, max_tokens=3000)
        if answer:
            # Prefer structured JSON when the reply contains any.
            json_answer = extract_json_from_text(answer)
            if json_answer:
                return json_answer
            return answer

        return None
|
| 503 |
+
|
| 504 |
+
    async def _submit_answer(self, submit_url: str, email: str, secret: str,
                             quiz_url: str, answer: Any) -> Dict[str, Any]:
        """
        Submit an answer to the quiz system via HTTP POST.

        Args:
            submit_url: URL to submit answer to
            email: User email
            secret: Secret key
            quiz_url: Original quiz URL
            answer: Computed answer

        Returns:
            The endpoint's JSON response; for non-JSON or error responses,
            a dict wrapping the text/status; {"error": ...} on request
            failures.  Never raises.
        """
        # Ensure answer is JSON-serializable before building the payload.
        try:
            # Try to serialize answer to check if it's valid JSON
            json.dumps(answer)
        except (TypeError, ValueError) as e:
            logger.warning(f"Answer is not JSON-serializable, converting to string: {e}")
            # Convert complex objects to string representation
            if isinstance(answer, (dict, list)):
                answer = json.dumps(answer)
            else:
                answer = str(answer)

        payload = {
            "email": email,
            "secret": secret,
            "url": quiz_url,
            "answer": answer
        }

        try:
            logger.info(f"Submitting answer to: {submit_url}")
            logger.debug(f"Payload: {json.dumps(payload, indent=2, default=str)}")

            # Synchronous POST inside an async method — blocks the event
            # loop for up to 60s; presumably acceptable for this workload.
            response = requests.post(
                submit_url,
                json=payload,
                headers={'Content-Type': 'application/json'},
                timeout=60
            )

            # Log response details
            logger.info(f"Response status: {response.status_code}")
            logger.debug(f"Response headers: {dict(response.headers)}")

            response.raise_for_status()

            try:
                result = response.json()
                logger.info(f"Submission successful: {result}")
                return result
            except json.JSONDecodeError:
                # Endpoint replied 2xx but not with JSON — wrap the text.
                logger.warning(f"Response is not JSON, returning text: {response.text[:500]}")
                return {"response": response.text, "status_code": response.status_code}

        except requests.exceptions.HTTPError as e:
            # Non-2xx status: try to surface the endpoint's own error body.
            logger.error(f"HTTP error submitting answer: {e}")
            if hasattr(e, 'response') and e.response is not None:
                try:
                    error_response = e.response.json()
                    logger.error(f"Error response: {error_response}")
                    return error_response
                except:
                    logger.error(f"Error response text: {e.response.text[:500]}")
                    return {"error": e.response.text, "status_code": e.response.status_code}
            return {"error": str(e)}
        except requests.exceptions.RequestException as e:
            # Network-level failure (DNS, timeout, connection refused, ...).
            logger.error(f"Error submitting answer: {e}", exc_info=True)
            return {"error": str(e)}
|
| 577 |
+
|
| 578 |
+
|
| 579 |
+
async def solve_quiz(url: str, email: str, secret: str) -> Dict[str, Any]:
    """
    Convenience function to solve a quiz.

    Creates a fresh QuizSolver (so recursion state starts at zero) and runs
    its solve_quiz coroutine.

    Args:
        url: Quiz page URL
        email: User email
        secret: Secret key

    Returns:
        Final response from quiz system
    """
    solver = QuizSolver()
    return await solver.solve_quiz(url, email, secret)
|
| 593 |
+
|
app/utils.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility functions for the IITM LLM Quiz Solver.
|
| 3 |
+
"""
|
| 4 |
+
import hmac
import json
import logging
import re
from typing import Optional, Dict, Any
from urllib.parse import urlparse, urljoin
|
| 9 |
+
|
| 10 |
+
# Configure logging
|
| 11 |
+
logging.basicConfig(
|
| 12 |
+
level=logging.INFO,
|
| 13 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 14 |
+
)
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def extract_submit_url(text: str, base_url: str) -> Optional[str]:
    """
    Extract the answer-submission URL from page text.

    Looks for patterns like:
    - "Submit your answer to: https://example.com/submit"
    - "Submit to: https://example.com/submit"
    - "URL: https://example.com/submit"
    then falls back to any absolute URL containing "submit"/"answer", then
    to relative hrefs containing "submit" (resolved against base_url).

    Args:
        text: The page text content
        base_url: Base URL for relative URL resolution

    Returns:
        Extracted submit URL or None
    """
    # Common patterns for submit URLs, most specific first.
    patterns = [
        r'[Ss]ubmit\s+(?:your\s+)?(?:answer\s+)?(?:to|at|via):\s*(https?://[^\s<>"\'\)]+)',
        r'[Ss]ubmit\s+[Tt]o:\s*(https?://[^\s<>"\'\)]+)',
        r'[Uu][Rr][Ll]:\s*(https?://[^\s<>"\'\)]+)',
        r'[Pp]ost\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
        r'[Ss]end\s+(?:to|at):\s*(https?://[^\s<>"\'\)]+)',
        r'(https?://[^\s<>"\'\)]*submit[^\s<>"\'\)]*)',
        r'(https?://[^\s<>"\'\)]*answer[^\s<>"\'\)]*)',
    ]

    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            # Trim trailing punctuation that often clings to inline URLs.
            url = matches[0].strip().rstrip('.,;:!?)}]{["\'')
            # Validate URL before accepting it.
            try:
                parsed = urlparse(url)
                if parsed.scheme and parsed.netloc:
                    logger.info(f"Found submit URL: {url}")
                    return url
            except Exception as e:
                logger.warning(f"Invalid URL pattern found: {url}, error: {e}")
                continue

    # Try to find any URL that might be a submit endpoint.
    url_pattern = r'https?://[^\s<>"\'\)]+'
    all_urls = re.findall(url_pattern, text)
    for url in all_urls:
        url_lower = url.lower()
        if 'submit' in url_lower or 'answer' in url_lower:
            try:
                parsed = urlparse(url)
                if parsed.scheme and parsed.netloc:
                    logger.info(f"Found potential submit URL: {url}")
                    return url
            except Exception:
                # Was a bare `except:`; narrowed for clarity.
                continue

    # Try to find relative submit links (e.g. href="/submit").
    # BUG FIX: the original character classes used doubled backslashes inside
    # raw strings (e.g. [^\\s...]), which excluded a literal backslash and the
    # letter 's' rather than whitespace — so many real paths never matched.
    rel_patterns = [
        r'href=["\'](/[^"\']*submit[^"\']*)["\']',
        r'(/[^\s"<>\']*submit[^\s"<>\']*)',
    ]
    for pattern in rel_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            candidate = matches[0].strip().rstrip('.,;:!?)}]{["\'')
            joined = urljoin(base_url, candidate)
            logger.info(f"Found relative submit URL: {joined}")
            return joined

    logger.warning("No submit URL found in page text")
    return None
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def validate_secret(secret: str, expected_secret: str) -> bool:
    """
    Validate the secret key.

    Uses a constant-time comparison (hmac.compare_digest) so the check does
    not leak information about the expected secret through timing.

    Args:
        secret: Provided secret
        expected_secret: Expected secret from environment

    Returns:
        True if valid, False otherwise
    """
    # Non-string inputs (e.g. None) can never match a configured secret,
    # and compare_digest would raise a TypeError on them.
    if not isinstance(secret, str) or not isinstance(expected_secret, str):
        return False
    return hmac.compare_digest(secret, expected_secret)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def clean_text(text: str) -> str:
    """
    Clean and normalize text content.

    Collapses every run of whitespace (spaces, tabs, newlines) into a single
    space and strips leading/trailing whitespace.

    Args:
        text: Raw text content

    Returns:
        Cleaned text ("" for empty input)
    """
    if not text:
        return ""
    # str.split() with no argument splits on any whitespace run and drops
    # empties, so join+split equals re.sub(r'\s+', ' ', text).strip().
    return ' '.join(text.split())
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def extract_json_from_text(text: str) -> Optional[Dict[str, Any]]:
    """
    Try to extract the first parseable JSON object from free text.

    The scan is regex-based and supports at most one level of nested braces.

    Args:
        text: Text that may contain JSON

    Returns:
        Parsed JSON dict or None
    """
    # Brace-delimited blocks, allowing one nested brace level.
    brace_block = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
    for candidate in re.finditer(brace_block, text, re.DOTALL):
        try:
            return json.loads(candidate.group(0))
        except json.JSONDecodeError:
            # Not valid JSON after all — keep scanning.
            continue
    return None
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def is_valid_url(url: str) -> bool:
    """
    Validate if a string is a well-formed absolute URL.

    Args:
        url: URL string to validate

    Returns:
        True when the string carries both a scheme and a network location.
    """
    try:
        parts = urlparse(url)
    except Exception:
        # urlparse can raise on pathological input (e.g. non-string types).
        return False
    return bool(parts.scheme) and bool(parts.netloc)
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def sanitize_filename(filename: str) -> str:
    """
    Sanitize a filename by replacing characters that are invalid on common
    filesystems with underscores.

    Args:
        filename: Original filename

    Returns:
        Sanitized filename, also stripped of leading/trailing dots/spaces
    """
    # Map each forbidden character (< > : " / \ | ? *) to an underscore in
    # a single C-level pass.
    forbidden = '<>:"/\\|?*'
    cleaned = filename.translate(str.maketrans(dict.fromkeys(forbidden, '_')))
    # Leading/trailing dots and spaces are problematic (notably on Windows).
    return cleaned.strip('. ')
|
| 180 |
+
|