"""Download academic PDFs, extract their text and detect sections/statistics."""

import asyncio
import os
import re
import tempfile

import fitz  # PyMuPDF
import httpx

CACHE_TTL = 10 * 60  # 10 minutes (in seconds)
pdf_cache = {}

# Heading patterns (Spanish and English) used to detect section starts.
# Order matters: for each candidate line the first matching entry wins.
ACADEMIC_SECTION_PATTERNS = [
    {"name": 'Resumen / Abstract', "category": 'front', "pattern": r'\b(resumen|abstract|sumario)\b'},
    {"name": 'Palabras Clave / Keywords', "category": 'front', "pattern": r'\b(palabras\s+clave|keywords|key\s+words)\b'},
    {"name": 'Introducción / Introduction', "category": 'intro', "pattern": r'\b(\d+\.?\s*introducci[oó]n|\d+\.?\s*introduction|introducci[oó]n|introduction)\b'},
    {"name": 'Planteamiento del Problema', "category": 'intro', "pattern": r'\b(planteamiento\s+del\s+problema|problem\s+statement|formulaci[oó]n\s+del\s+problema|definici[oó]n\s+del\s+problema)\b'},
    {"name": 'Justificación', "category": 'intro', "pattern": r'\b(justificaci[oó]n|justification|importancia|relevancia|motivation)\b'},
    {"name": 'Objetivos', "category": 'intro', "pattern": r'\b(objetivos?\s*(generales?|espec[ií]ficos?)?|objectives?|goals?|aims?)\b'},
    {"name": 'Hipótesis', "category": 'intro', "pattern": r'\b(hip[oó]tesis|hypothesis|hypotheses)\b'},
    {"name": 'Marco Teórico', "category": 'theory', "pattern": r'\b(marco\s+te[oó]rico|theoretical\s+framework|fundamento\s+te[oó]rico|bases\s+te[oó]ricas|theoretical\s+background|state\s+of\s+the\s+art)\b'},
    {"name": 'Antecedentes', "category": 'theory', "pattern": r'\b(antecedentes|background|related\s+work|literature\s+review|revisi[oó]n\s+de\s+literatura|estado\s+del\s+arte|trabajos\s+previos|prior\s+work)\b'},
    {"name": 'Bases Conceptuales', "category": 'theory', "pattern": r'\b(bases\s+conceptuales|marco\s+conceptual|conceptual\s+framework|definici[oó]n\s+de\s+t[eé]rminos|glosario)\b'},
    {"name": 'Metodología / Methods', "category": 'methods', "pattern": r'\b(\d+\.?\s*metodolog[ií]a|\d+\.?\s*methods?|metodolog[ií]a|methods?|methodology|materiales?\s+y\s+m[eé]todos?|materials?\s+and\s+methods?|procedimiento|approach|proposed\s+method|dise[ñn]o\s+metodol[oó]gico)\b'},
    {"name": 'Población y Muestra', "category": 'methods', "pattern": r'\b(poblaci[oó]n\s+y\s+muestra|population\s+and\s+sample|sample\s+size|muestra|participants?|participantes|sujetos)\b'},
    {"name": 'Instrumentos', "category": 'methods', "pattern": r'\b(instrumentos?\s+de\s+recolecci[oó]n|instruments?|herramientas|cuestionario|encuesta|survey|data\s+collection)\b'},
    {"name": 'Resultados / Results', "category": 'results', "pattern": r'\b(\d+\.?\s*resultados|\d+\.?\s*results|resultados|results|findings|hallazgos)\b'},
    {"name": 'Análisis de Datos', "category": 'results', "pattern": r'\b(an[aá]lisis\s+de\s+(datos|resultados)|data\s+analysis|analysis\s+of\s+results|an[aá]lisis\s+estad[ií]stico|statistical\s+analysis)\b'},
    {"name": 'Discusión / Discussion', "category": 'results', "pattern": r'\b(\d+\.?\s*discusi[oó]n|\d+\.?\s*discussion|discusi[oó]n|discussion|interpretaci[oó]n)\b'},
    {"name": 'Conclusiones / Conclusions', "category": 'conclusion', "pattern": r'\b(\d+\.?\s*conclusi[oó]n|\d+\.?\s*conclusions?|conclusi[oó]n|conclusions?|concluding\s+remarks)\b'},
    {"name": 'Recomendaciones', "category": 'conclusion', "pattern": r'\b(recomendaciones|recommendations|sugerencias|suggestions|future\s+work|trabajo\s+futuro|trabajos?\s+futuros?)\b'},
    {"name": 'Referencias / References', "category": 'back', "pattern": r'\b(referencias|references|bibliograf[ií]a|bibliography|works\s+cited)\b'},
]

# Patterns for statistical notation. They are applied with re.IGNORECASE and
# use only non-capturing groups so that re.findall returns whole matches.
STATS_PATTERNS = [
    {"name": 'p-value', "pattern": r'p\s*[<>=≤≥]\s*0?\.\d+'},
    # The decimal separator is optional so integer percentages ("45%") are
    # detected as well as decimal ones ("45.5%", "45,5 %").
    {"name": 'percentage', "pattern": r'\d+[\.,]?\d*\s*%'},
    {"name": 'mean_std', "pattern": r'(?:media|mean|promedio|average|M)\s*[=:]\s*\d+[\.,]?\d*'},
    {"name": 'correlation', "pattern": r'r\s*[=]\s*[+-]?0?\.\d+'},
    {"name": 'chi_square', "pattern": r'(?:chi|χ)[²2]\s*[=()]\s*\d+[\.,]?\d*'},
    {"name": 'confidence_interval', "pattern": r'(?:IC|CI)\s*[=:(\[]\s*\d+'},
    {"name": 't_test', "pattern": r't\s*[=(]\s*\d+[\.,]?\d*'},
    {"name": 'f_test', "pattern": r'F\s*[=(]\s*\d+[\.,]?\d*'},
    {"name": 'n_sample', "pattern": r'(?:n|N)\s*[=]\s*\d+'},
    {"name": 'alpha', "pattern": r'(?:α|alfa|alpha)\s*[=]\s*0?\.\d+'},
    {"name": 'anova', "pattern": r'ANOVA|an[aá]lisis\s+de\s+varianza'},
]

# A section's statistics are computed over at most this many characters ...
_SECTION_ANALYSIS_LIMIT = 8000
# ... while at most this many characters of content are returned to callers.
_SECTION_CONTENT_LIMIT = 5000


async def download_pdf(url: str) -> bytes:
    """Download a PDF, rejecting responses that are not actually PDFs.

    A HEAD request is issued first to cheaply reject URLs that advertise an
    HTML content type; the body fetched with GET must then start with the
    ``%PDF-`` magic bytes.

    Args:
        url: Address of the PDF to fetch.

    Returns:
        The raw bytes of the PDF.

    Raises:
        ValueError: If the URL serves HTML, the GET fails or returns an error
            status, or the payload does not start with ``%PDF-``. The
            underlying exception is attached as ``__cause__``.
    """
    # NOTE(security): verify=False disables TLS certificate verification, so
    # the downloaded content is not protected against tampering in transit.
    # It is kept to preserve existing behaviour (presumably for hosts with
    # broken certificates - confirm whether it is still required).
    async with httpx.AsyncClient(verify=False, follow_redirects=True) as client:
        try:
            try:
                head_res = await client.head(url, timeout=10.0)
            except httpx.HTTPError:
                # The HEAD probe is only an optimisation: some servers refuse
                # or mishandle HEAD. Fall through to GET, whose magic-bytes
                # check still rejects non-PDF payloads.
                head_res = None
            if head_res is not None:
                content_type = head_res.headers.get('content-type', '').lower()
                if 'text/html' in content_type:
                    raise ValueError(f"URL returned HTML instead of PDF: {url}")

            res = await client.get(url, timeout=30.0)
            res.raise_for_status()
            content = res.content
            if not content.startswith(b'%PDF-'):
                raise ValueError("Downloaded file is not a valid PDF")
            return content
        except Exception as e:
            # Callers receive a single exception type for any failure.
            raise ValueError(f"Failed to download PDF from {url}: {e}") from e


async def extract_text(pdf_bytes: bytes) -> str:
    """Extract the full text of a PDF using PyMuPDF.

    Args:
        pdf_bytes: Raw bytes of a PDF document.

    Returns:
        The text of every page, each followed by a newline, or an empty
        string if the document cannot be opened or read (the error is
        printed rather than raised).
    """
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            return "".join(page.get_text() + "\n" for page in doc)
    except Exception as e:
        # Deliberately best-effort: callers treat "" as "no text available".
        print(f"[PDF_PROCESSOR] Error extracting text: {e}")
        return ""


def classify_document(text: str) -> str:
    """Classify a document as a thesis or an article by keyword counts.

    Only the first 5000 characters are inspected. A label is returned when
    its keyword count is strictly greater than the other's and at least 2.

    Args:
        text: Full text of the document.

    Returns:
        ``'thesis'``, ``'article'`` or ``'unknown'``.
    """
    lower_text = text[:5000].lower()
    thesis_score = len(re.findall(
        r'tesis|tesina|disertaci[oó]n|para optar|bachiller|licenciatura|maestr[ií]a',
        lower_text,
    ))
    article_score = len(re.findall(
        r'\babstract\b|\bjournal\b|revista|doi:\s*10\.',
        lower_text,
    ))
    if thesis_score > article_score and thesis_score >= 2:
        return 'thesis'
    if article_score > thesis_score and article_score >= 2:
        return 'article'
    return 'unknown'


def extract_statistics(text: str) -> list:
    """Find statistical notation in ``text`` using ``STATS_PATTERNS``.

    Args:
        text: Text to scan.

    Returns:
        One dict per pattern that matched, in ``STATS_PATTERNS`` order, with
        keys ``"type"`` (pattern name), ``"matches"`` (up to 10 distinct
        matches in order of first appearance) and ``"count"`` (number of
        distinct matches).
    """
    stats = []
    for sp in STATS_PATTERNS:
        found = re.findall(sp["pattern"], text, re.IGNORECASE)
        # dict.fromkeys de-duplicates while keeping first-appearance order,
        # so the truncated "matches" list is the same on every run.
        matches = list(dict.fromkeys(found))
        if matches:
            stats.append({
                "type": sp["name"],
                "matches": matches[:10],
                "count": len(matches),
            })
    return stats


def _find_section_starts(text: str) -> list:
    """Return the detected section headings of ``text`` in document order."""
    # Compile once here instead of resolving every pattern for every line.
    heading_patterns = [
        (re.compile(sp["pattern"], re.IGNORECASE), sp)
        for sp in ACADEMIC_SECTION_PATTERNS
    ]
    section_starts = []
    char_offset = 0
    for line_idx, line in enumerate(text.split('\n')):
        clean_line = line.strip()
        # Only short lines are considered heading candidates.
        if 2 < len(clean_line) < 100:
            for regex, sp in heading_patterns:
                if regex.search(clean_line):
                    section_starts.append({
                        "name": sp["name"],
                        "category": sp["category"],
                        "lineIdx": line_idx,
                        "charIdx": char_offset,
                    })
                    break  # first matching pattern wins for this line
        # +1 accounts for the '\n' removed by split().
        char_offset += len(line) + 1
    return section_starts


def _build_sections(text: str, section_starts: list) -> list:
    """Slice ``text`` into sections and attach per-section statistics."""
    # Each section ends where the next begins; the last runs to end of text.
    end_offsets = [s["charIdx"] for s in section_starts[1:]] + [len(text)]
    sections = []
    for start, end_idx in zip(section_starts, end_offsets):
        content = text[start["charIdx"]:end_idx].strip()
        # Limit to avoid massive text blocks.
        section_text = content[:_SECTION_ANALYSIS_LIMIT]
        stats = extract_statistics(section_text)
        sections.append({
            "name": start["name"],
            "category": start["category"],
            "content": section_text[:_SECTION_CONTENT_LIMIT],
            "statistics": stats,
            "hasNumericalData": bool(stats) or bool(
                re.search(r'\d+[\.,]\d+', section_text)
            ),
        })
    return sections


async def analyze_academic_document(url_or_path: str) -> dict:
    """Load a PDF, split it into academic sections and collect statistics.

    Args:
        url_or_path: An ``http://`` / ``https://`` URL to download, or a path
            to a local PDF file.

    Returns:
        A dict with keys ``"documentType"``, ``"sections"`` (list of dicts
        with ``name``, ``category``, ``content``, ``statistics`` and
        ``hasNumericalData``), ``"globalStatistics"`` and ``"summary"``
        (``totalSections`` and ``totalStatisticalItems``).

    Raises:
        ValueError: If a URL cannot be downloaded as a valid PDF.
        OSError: If a local path cannot be opened or read.
    """
    # Test for a real scheme so local paths that merely begin with "http"
    # (e.g. "httpdocs/paper.pdf") are read from disk.
    if url_or_path.lower().startswith(("http://", "https://")):
        pdf_bytes = await download_pdf(url_or_path)
    else:
        with open(url_or_path, 'rb') as f:
            pdf_bytes = f.read()

    text = await extract_text(pdf_bytes)
    doc_type = classify_document(text)
    sections = _build_sections(text, _find_section_starts(text))
    global_stats = extract_statistics(text)

    return {
        "documentType": doc_type,
        "sections": sections,
        "globalStatistics": global_stats,
        "summary": {
            "totalSections": len(sections),
            "totalStatisticalItems": sum(s["count"] for s in global_stats),
        },
    }