# File size: 22,554 Bytes
# 2f95553
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
"""

Quiz solver module - main logic for solving quizzes.

"""
import asyncio
import json
import logging
import re
from typing import Optional, Dict, Any, List
import requests
from bs4 import BeautifulSoup
import pandas as pd
import io
import base64

from app.browser import get_browser, cleanup_browser
from app.llm import ask_gpt, parse_question_with_llm, solve_with_llm, initialize_llm
from app.utils import extract_submit_url, clean_text, extract_json_from_text, is_valid_url

# Module-level logger named after this module; used by every method below.
logger = logging.getLogger(__name__)

# Initialize LLM on module load. This runs as an import-time side effect, so
# merely importing this module triggers it.
# NOTE(review): presumably this configures the client used by ask_gpt /
# solve_with_llm -- confirm in app.llm.
initialize_llm()


class QuizSolver:
    """Main quiz solver class."""
    
    def __init__(self):
        self.browser = None
        self.max_recursion = 10
        self.current_recursion = 0
    
    async def solve_quiz(self, url: str, email: str, secret: str) -> Dict[str, Any]:
        """
        Main entry point: solve the quiz at ``url`` and any quizzes chained after it.

        Args:
            url: Quiz page URL
            email: User email
            secret: Secret key

        Returns:
            Final response from quiz system
        """
        # Each top-level call starts a new chain, so the depth counter restarts.
        self.current_recursion = 0
        self.browser = await get_browser()

        # The browser is deliberately not closed here as it might be reused.
        final_response = await self._solve_recursive(url, email, secret)
        return final_response
    
    async def _solve_recursive(self, url: str, email: str, secret: str) -> Dict[str, Any]:
        """
        Solve one quiz page, then recurse into the next quiz if the submission
        response points at one.

        Args:
            url: Current quiz URL
            email: User email
            secret: Secret key

        Returns:
            Response from quiz system, or an ``{"error": ...}`` dict when the
            depth limit is hit, no submit URL is found, or an exception occurs.
        """
        if self.max_recursion <= self.current_recursion:
            logger.error("Maximum recursion depth reached")
            return {"error": "Maximum recursion depth reached"}

        self.current_recursion += 1
        logger.info(f"Solving quiz {self.current_recursion}: {url}")

        try:
            page = await self.browser.load_page(url, wait_time=3)

            # Look for the submit endpoint in the rendered text first, then in
            # the text re-derived from the raw HTML.
            submit_url = extract_submit_url(page['text'], url)
            if not submit_url:
                html_text = BeautifulSoup(page['html'], 'html.parser').get_text()
                submit_url = extract_submit_url(html_text, url)
            if not submit_url:
                logger.error("Could not find submit URL")
                return {"error": "Submit URL not found"}

            question_text = self._extract_question(page)
            logger.info(f"Question extracted: {question_text[:200]}...")

            raw_answer = await self._solve_question(question_text, page)
            # Ensure the answer is a string or simple JSON-serializable value.
            answer = self._normalize_answer(raw_answer)
            logger.info(f"Answer computed: {str(answer)[:200]}...")

            response = await self._submit_answer(submit_url, email, secret, url, answer)

            # Only follow a next quiz when the response is a dict naming a
            # different, valid URL; otherwise this response is final.
            if not (isinstance(response, dict) and 'url' in response):
                return response
            next_url = response['url']
            if not (next_url and next_url != url and is_valid_url(next_url)):
                return response

            logger.info(f"Next quiz found: {next_url}")
            return await self._solve_recursive(next_url, email, secret)

        except Exception as e:
            logger.error(f"Error solving quiz: {e}", exc_info=True)
            return {"error": str(e)}
    
    def _extract_question(self, page_content: Dict[str, Any]) -> str:
        """

        Extract question text from page content.

        

        Args:

            page_content: Page content dictionary

            

        Returns:

            Question text

        """
        text = page_content.get('all_text', page_content.get('text', ''))
        
        # Try to find question markers
        question_patterns = [
            r'[Qq]uestion[:\s]+(.*?)(?:\n\n|\n[A-Z]|$)',
            r'[Pp]roblem[:\s]+(.*?)(?:\n\n|\n[A-Z]|$)',
            r'[Tt]ask[:\s]+(.*?)(?:\n\n|\n[A-Z]|$)',
        ]
        
        for pattern in question_patterns:
            match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
            if match:
                return clean_text(match.group(1))
        
        # If no pattern matches, return first substantial paragraph
        paragraphs = [p.strip() for p in text.split('\n\n') if len(p.strip()) > 50]
        if paragraphs:
            return paragraphs[0]
        
        return clean_text(text[:1000])  # Return first 1000 chars
    
    async def _solve_question(self, question: str, page_content: Dict[str, Any]) -> Any:
        """
        Solve a quiz question by trying a sequence of strategies in order.

        Args:
            question: Question text
            page_content: Full page content

        Returns:
            Answer (can be dict, list, string, number, etc.); the literal
            string "answer" when every strategy fails.
        """
        logger.info("Analyzing question type...")

        # The parse result is not consumed by any strategy below; the call is
        # kept so the sequence of LLM requests stays as it was.
        await parse_question_with_llm(question, page_content.get('text', ''))

        page_data = self._extract_data_from_page(page_content)

        # Strategy 1: the answer is already spelled out on the page.
        found_on_page = self._find_answer_in_page(page_content, question)
        if found_on_page:
            logger.info("Answer found in page content")
            return found_on_page

        # Strategy 2: download linked data files and solve against them.
        file_urls = self._find_data_files(page_content)
        if file_urls:
            logger.info(f"Found data files: {file_urls}")
            file_data = await self._process_data_files(file_urls)
            if file_data:
                data_answer = await self._solve_with_data(question, file_data)
                if data_answer:
                    return data_answer

        # Strategy 3: ask the LLM, preferring a JSON reading of its reply.
        logger.info("Attempting to solve with LLM...")
        llm_text = await solve_with_llm(question, page_data)
        if llm_text:
            return extract_json_from_text(llm_text) or llm_text

        # Strategy 4: many quiz pages state the answer in the question itself.
        fallback = self._extract_simple_answer(question, page_content)
        if fallback:
            logger.info("Extracted simple answer from question")
            return fallback

        # Strategy 5: nothing worked, so submit a placeholder.
        logger.warning("Could not solve question, using default answer")
        return "answer"
    
    def _extract_data_from_page(self, page_content: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract structured data from page.

        Copies the raw text, HTML, links and images, and adds:
        - 'tables': one list of row dicts per HTML <table> that pandas can
          parse (key present only when the page contains a <table>)
        - 'json': JSON found in the page text (key present only when found)

        Args:
            page_content: Page content dictionary

        Returns:
            Dictionary of extracted data
        """
        html = page_content.get('html', '')
        text = page_content.get('text', '')
        data = {
            'text': text,
            'html': html,
            'links': page_content.get('links', []),
            'images': page_content.get('images', []),
        }

        # Table extraction is best-effort: a bad page must not stop solving.
        try:
            soup = BeautifulSoup(html, 'html.parser')
            tables = soup.find_all('table')
            if tables:
                data['tables'] = []
                for table in tables:
                    try:
                        # Wrap in StringIO: pandas deprecates passing literal
                        # HTML strings to read_html.
                        frames = pd.read_html(io.StringIO(str(table)))
                        data['tables'].append(frames[0].to_dict('records'))
                    except Exception as e:
                        # Skip only this table; others may still parse.
                        logger.debug(f"Skipping unparseable table: {e}")
        except Exception as e:
            logger.warning(f"Error extracting tables: {e}")

        json_data = extract_json_from_text(text)
        if json_data:
            data['json'] = json_data

        return data
    
    def _find_answer_in_page(self, page_content: Dict[str, Any], question: str) -> Optional[Any]:
        """

        Check if answer is already present in page content.

        

        Args:

            page_content: Page content

            question: Question text

            

        Returns:

            Answer if found, None otherwise

        """
        text = page_content.get('all_text', page_content.get('text', ''))
        
        # Look for answer patterns
        answer_patterns = [
            r'[Aa]nswer[:\s]+(.*?)(?:\n\n|$)',
            r'[Ss]olution[:\s]+(.*?)(?:\n\n|$)',
            r'[Rr]esult[:\s]+(.*?)(?:\n\n|$)',
        ]
        
        for pattern in answer_patterns:
            match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
            if match:
                answer_text = clean_text(match.group(1))
                # Try to parse as JSON
                json_answer = extract_json_from_text(answer_text)
                if json_answer:
                    return json_answer
                return answer_text
        
        return None
    
    def _find_data_files(self, page_content: Dict[str, Any]) -> List[str]:
        """

        Find data files (CSV, JSON, PDF, etc.) linked in the page.

        

        Args:

            page_content: Page content

            

        Returns:

            List of file URLs

        """
        files = []
        
        # Check links
        for link in page_content.get('links', []):
            href = link.get('href', '')
            if any(href.lower().endswith(ext) for ext in ['.csv', '.json', '.pdf', '.xlsx', '.txt']):
                files.append(href)
        
        # Check text for file URLs
        text = page_content.get('text', '')
        file_pattern = r'https?://[^\s<>"\'\)]+\.(csv|json|pdf|xlsx|txt)'
        matches = re.findall(file_pattern, text, re.IGNORECASE)
        files.extend([m[0] for m in matches if m[0] not in files])
        
        return files
    
    async def _process_data_files(self, file_urls: List[str]) -> Dict[str, Any]:
        """

        Download and process data files.

        

        Args:

            file_urls: List of file URLs

            

        Returns:

            Dictionary of processed data

        """
        processed = {}
        
        for url in file_urls:
            try:
                logger.info(f"Downloading file: {url}")
                response = requests.get(url, timeout=30)
                response.raise_for_status()
                
                content_type = response.headers.get('content-type', '').lower()
                filename = url.split('/')[-1]
                
                if 'csv' in content_type or filename.endswith('.csv'):
                    df = pd.read_csv(io.StringIO(response.text))
                    processed[filename] = df.to_dict('records')
                    
                elif 'json' in content_type or filename.endswith('.json'):
                    processed[filename] = response.json()
                    
                elif 'pdf' in content_type or filename.endswith('.pdf'):
                    # PDF processing - try pdfplumber first, then PyPDF2
                    text = None
                    
                    # Try pdfplumber
                    try:
                        import pdfplumber
                        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
                            text = ""
                            for page in pdf.pages:
                                page_text = page.extract_text()
                                if page_text:
                                    text += page_text + "\n"
                        if text:
                            processed[filename] = text.strip()
                    except ImportError:
                        logger.debug("pdfplumber not available")
                    except Exception as e:
                        logger.warning(f"Error reading PDF with pdfplumber {filename}: {e}")
                    
                    # Fallback to PyPDF2
                    if not text or filename not in processed:
                        try:
                            import PyPDF2
                            pdf_file = io.BytesIO(response.content)
                            pdf_reader = PyPDF2.PdfReader(pdf_file)
                            text = ""
                            for page in pdf_reader.pages:
                                page_text = page.extract_text()
                                if page_text:
                                    text += page_text + "\n"
                            if text:
                                processed[filename] = text.strip()
                        except ImportError:
                            logger.warning("Neither pdfplumber nor PyPDF2 available for PDF processing")
                        except Exception as e:
                            logger.warning(f"Error reading PDF with PyPDF2 {filename}: {e}")
                
                elif filename.endswith('.txt'):
                    processed[filename] = response.text
                    
            except Exception as e:
                logger.error(f"Error processing file {url}: {e}")
                continue
        
        return processed
    
    def _normalize_answer(self, answer: Any) -> Any:
        """

        Normalize answer to ensure it's JSON-serializable and in correct format.

        

        Args:

            answer: Raw answer (can be dict, list, string, etc.)

            

        Returns:

            Normalized answer (preferably string or simple JSON)

        """
        if answer is None:
            return "answer"
        
        # If it's a dict with question/analysis, extract a simple answer
        if isinstance(answer, dict):
            # If it contains an 'answer' key, use that
            if 'answer' in answer:
                return self._normalize_answer(answer['answer'])
            # If it's an analysis dict, try to extract something useful
            if 'question' in answer and len(answer) > 1:
                # Return a simple string instead of the whole dict
                return "answer"
            # If it's a simple dict, convert to JSON string
            if len(answer) <= 3:
                try:
                    return json.dumps(answer)
                except:
                    return str(answer)
            # Complex dict - return as JSON string
            try:
                return json.dumps(answer)
            except:
                return str(answer)
        
        # If it's a list, convert to JSON string if small, otherwise string
        if isinstance(answer, list):
            if len(answer) <= 10:
                try:
                    return json.dumps(answer)
                except:
                    return str(answer)
            return str(answer)
        
        # For strings, return as-is (but clean up)
        if isinstance(answer, str):
            # Remove excessive whitespace
            answer = ' '.join(answer.split())
            # If it's very long, truncate
            if len(answer) > 1000:
                answer = answer[:1000] + "..."
            return answer
        
        # For other types, convert to string
        return str(answer)
    
    def _extract_simple_answer(self, question: str, page_content: Dict[str, Any]) -> Optional[str]:
        """

        Try to extract a simple answer from the question or page.

        

        Args:

            question: Question text

            page_content: Page content

            

        Returns:

            Simple answer string or None

        """
        text = page_content.get('all_text', page_content.get('text', ''))
        combined = question + "\n\n" + text
        
        # Check if question says "anything" or similar - very common in demo quizzes
        if re.search(r'"answer"\s*:\s*"anything\s+you\s+want"', combined, re.IGNORECASE):
            return "answer"
        if re.search(r'"answer"\s*:\s*"anything"', combined, re.IGNORECASE):
            return "answer"
        if re.search(r'anything\s+you\s+want|any\s+value|any\s+string|any\s+text|anything', question, re.IGNORECASE):
            return "answer"
        
        # Look for patterns like "answer: X" or "the answer is X"
        patterns = [
            r'"answer"\s*:\s*"([^"]+)"',  # JSON format: "answer": "value"
            r'[Aa]nswer[:\s]+["\']?([^"\'\n]+)["\']?',
            r'[Tt]he\s+[Aa]nswer\s+[Ii]s[:\s]+["\']?([^"\'\n]+)["\']?',
            r'[Yy]our\s+[Aa]nswer[:\s]+["\']?([^"\'\n]+)["\']?',
        ]
        
        for pattern in patterns:
            match = re.search(pattern, combined, re.IGNORECASE)
            if match:
                answer = match.group(1).strip()
                # Skip if it's a placeholder or instruction
                if answer and len(answer) < 200 and answer.lower() not in ['your email', 'your secret', 'anything you want', 'anything']:
                    return answer
        
        return None
    
    async def _solve_with_data(self, question: str, data: Dict[str, Any]) -> Optional[Any]:
        """

        Solve question using processed data.

        

        Args:

            question: Question text

            data: Processed data dictionary

            

        Returns:

            Answer or None

        """
        # Use LLM to solve with data
        prompt = f"""Solve this question using the provided data:



Question: {question}



Data:

{json.dumps(data, indent=2, default=str)}



Provide the answer. If JSON format is required, return valid JSON.

"""
        
        answer = await ask_gpt(prompt, max_tokens=3000)
        if answer:
            json_answer = extract_json_from_text(answer)
            if json_answer:
                return json_answer
            return answer
        
        return None
    
    async def _submit_answer(self, submit_url: str, email: str, secret: str,
                            quiz_url: str, answer: Any) -> Dict[str, Any]:
        """
        Submit answer to the quiz system.

        Network and HTTP failures are returned as ``{"error": ...}`` dicts
        instead of being raised.

        Args:
            submit_url: URL to submit answer to
            email: User email
            secret: Secret key
            quiz_url: Original quiz URL
            answer: Computed answer

        Returns:
            Parsed JSON response from the submission endpoint; if the body is
            not JSON, a dict with the raw text and status code.
        """
        # Ensure answer is JSON-serializable before building the payload.
        try:
            json.dumps(answer)
        except (TypeError, ValueError) as e:
            logger.warning(f"Answer is not JSON-serializable, converting to string: {e}")
            if isinstance(answer, (dict, list)):
                # default=str stringifies the offending members; a plain
                # json.dumps here would raise the same error again.
                try:
                    answer = json.dumps(answer, default=str)
                except (TypeError, ValueError):
                    answer = str(answer)
            else:
                answer = str(answer)

        payload = {
            "email": email,
            "secret": secret,
            "url": quiz_url,
            "answer": answer
        }

        try:
            logger.info(f"Submitting answer to: {submit_url}")
            # Never write the secret to logs, even at debug level.
            redacted = dict(payload, secret="***")
            logger.debug(f"Payload: {json.dumps(redacted, indent=2, default=str)}")

            # requests is blocking; run it in the default executor so the
            # event loop is not stalled for up to the 60 s timeout.
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                lambda: requests.post(
                    submit_url,
                    json=payload,
                    headers={'Content-Type': 'application/json'},
                    timeout=60,
                ),
            )

            logger.info(f"Response status: {response.status_code}")
            logger.debug(f"Response headers: {dict(response.headers)}")

            response.raise_for_status()

            try:
                result = response.json()
                logger.info(f"Submission successful: {result}")
                return result
            except ValueError:
                # Every JSON decode error requests can raise subclasses
                # ValueError, whichever json backend is in use.
                logger.warning(f"Response is not JSON, returning text: {response.text[:500]}")
                return {"response": response.text, "status_code": response.status_code}

        except requests.exceptions.HTTPError as e:
            logger.error(f"HTTP error submitting answer: {e}")
            if getattr(e, 'response', None) is not None:
                try:
                    error_response = e.response.json()
                    logger.error(f"Error response: {error_response}")
                    return error_response
                except ValueError:
                    logger.error(f"Error response text: {e.response.text[:500]}")
                    return {"error": e.response.text, "status_code": e.response.status_code}
            return {"error": str(e)}
        except requests.exceptions.RequestException as e:
            logger.error(f"Error submitting answer: {e}", exc_info=True)
            return {"error": str(e)}


async def solve_quiz(url: str, email: str, secret: str) -> Dict[str, Any]:
    """
    Convenience wrapper: solve a quiz with a freshly created QuizSolver.

    Args:
        url: Quiz page URL
        email: User email
        secret: Secret key

    Returns:
        Final response from quiz system
    """
    return await QuizSolver().solve_quiz(url, email, secret)