| import fitz |
| import re |
| import httpx |
| import tempfile |
| import os |
| import asyncio |
|
|
# Cache lifetime: 10 * 60 = 600.
# NOTE(review): presumably seconds (i.e. ten minutes) -- confirm against the
# code that reads this constant.
CACHE_TTL = 10 * 60

# Module-level cache container; starts empty.
# NOTE(review): key/value layout is not visible in this section -- presumably
# keyed by URL or path; verify against the code that populates it.
pdf_cache: dict = {}
|
|
# Heading detectors for common academic-paper / thesis sections, in Spanish
# and English. Each entry carries:
#   "name"     -- display label reported for the detected section
#   "category" -- coarse grouping (front, intro, theory, methods, results,
#                 conclusion, back)
#   "pattern"  -- regular expression searched (case-insensitively) in a line
# Order matters: the first entry whose pattern matches a line wins.
# Accented letters are written as real Unicode characters inside the
# character classes (e.g. [oó]) so both accented and unaccented spellings of
# a heading are recognised.
ACADEMIC_SECTION_PATTERNS = [
    {"name": 'Resumen / Abstract', "category": 'front', "pattern": r'\b(resumen|abstract|sumario)\b'},
    {"name": 'Palabras Clave / Keywords', "category": 'front', "pattern": r'\b(palabras\s+clave|keywords|key\s+words)\b'},
    {"name": 'Introducción / Introduction', "category": 'intro', "pattern": r'\b(\d+\.?\s*introducci[oó]n|\d+\.?\s*introduction|introducci[oó]n|introduction)\b'},
    {"name": 'Planteamiento del Problema', "category": 'intro', "pattern": r'\b(planteamiento\s+del\s+problema|problem\s+statement|formulaci[oó]n\s+del\s+problema|definici[oó]n\s+del\s+problema)\b'},
    {"name": 'Justificación', "category": 'intro', "pattern": r'\b(justificaci[oó]n|justification|importancia|relevancia|motivation)\b'},
    {"name": 'Objetivos', "category": 'intro', "pattern": r'\b(objetivos?\s*(generales?|espec[ií]ficos?)?|objectives?|goals?|aims?)\b'},
    {"name": 'Hipótesis', "category": 'intro', "pattern": r'\b(hip[oó]tesis|hypothesis|hypotheses)\b'},
    {"name": 'Marco Teórico', "category": 'theory', "pattern": r'\b(marco\s+te[oó]rico|theoretical\s+framework|fundamento\s+te[oó]rico|bases\s+te[oó]ricas|theoretical\s+background|state\s+of\s+the\s+art)\b'},
    {"name": 'Antecedentes', "category": 'theory', "pattern": r'\b(antecedentes|background|related\s+work|literature\s+review|revisi[oó]n\s+de\s+literatura|estado\s+del\s+arte|trabajos\s+previos|prior\s+work)\b'},
    {"name": 'Bases Conceptuales', "category": 'theory', "pattern": r'\b(bases\s+conceptuales|marco\s+conceptual|conceptual\s+framework|definici[oó]n\s+de\s+t[eé]rminos|glosario)\b'},
    {"name": 'Metodología / Methods', "category": 'methods', "pattern": r'\b(\d+\.?\s*metodolog[ií]a|\d+\.?\s*methods?|metodolog[ií]a|methods?|methodology|materiales?\s+y\s+m[eé]todos?|materials?\s+and\s+methods?|procedimiento|approach|proposed\s+method|dise[ñn]o\s+metodol[oó]gico)\b'},
    {"name": 'Población y Muestra', "category": 'methods', "pattern": r'\b(poblaci[oó]n\s+y\s+muestra|population\s+and\s+sample|sample\s+size|muestra|participants?|participantes|sujetos)\b'},
    {"name": 'Instrumentos', "category": 'methods', "pattern": r'\b(instrumentos?\s+de\s+recolecci[oó]n|instruments?|herramientas|cuestionario|encuesta|survey|data\s+collection)\b'},
    {"name": 'Resultados / Results', "category": 'results', "pattern": r'\b(\d+\.?\s*resultados|\d+\.?\s*results|resultados|results|findings|hallazgos)\b'},
    {"name": 'Análisis de Datos', "category": 'results', "pattern": r'\b(an[aá]lisis\s+de\s+(datos|resultados)|data\s+analysis|analysis\s+of\s+results|an[aá]lisis\s+estad[ií]stico|statistical\s+analysis)\b'},
    {"name": 'Discusión / Discussion', "category": 'results', "pattern": r'\b(\d+\.?\s*discusi[oó]n|\d+\.?\s*discussion|discusi[oó]n|discussion|interpretaci[oó]n)\b'},
    {"name": 'Conclusiones / Conclusions', "category": 'conclusion', "pattern": r'\b(\d+\.?\s*conclusi[oó]n|\d+\.?\s*conclusions?|conclusi[oó]n|conclusions?|concluding\s+remarks)\b'},
    {"name": 'Recomendaciones', "category": 'conclusion', "pattern": r'\b(recomendaciones|recommendations|sugerencias|suggestions|future\s+work|trabajo\s+futuro|trabajos?\s+futuros?)\b'},
    {"name": 'Referencias / References', "category": 'back', "pattern": r'\b(referencias|references|bibliograf[ií]a|bibliography|works\s+cited)\b'},
]
|
|
# Regexes that recognise statistical reporting in running text. Each entry
# has a "name" (statistic type) and a "pattern".
#
# * Every group is non-capturing so that re.findall() yields the whole
#   matched string rather than a sub-group.
# * Symbol-style patterns (p, r, t, F, n, M, IC/CI, ...) start with \b so
#   they do not fire on the tail of an ordinary word when matched
#   case-insensitively (e.g. "that (2019)" is not a t-test).
# * Non-ASCII symbols (≤ ≥ χ ² α á) are written as real Unicode characters.
# * The percentage pattern accepts integer percentages ("45%") as well as
#   decimal ones ("12,5 %").
STATS_PATTERNS = [
    {"name": 'p-value', "pattern": r'\bp\s*[<>=≤≥]\s*0?\.\d+'},
    {"name": 'percentage', "pattern": r'\d+(?:[\.,]\d*)?\s*%'},
    {"name": 'mean_std', "pattern": r'\b(?:media|mean|promedio|average|M)\s*[=:]\s*\d+[\.,]?\d*'},
    {"name": 'correlation', "pattern": r'\br\s*[=]\s*[+-]?0?\.\d+'},
    {"name": 'chi_square', "pattern": r'\b(?:chi|χ)[²2]\s*[=()]\s*\d+[\.,]?\d*'},
    {"name": 'confidence_interval', "pattern": r'\b(?:IC|CI)\s*[=:(\[]\s*\d+'},
    {"name": 't_test', "pattern": r'\bt\s*[=(]\s*\d+[\.,]?\d*'},
    {"name": 'f_test', "pattern": r'\bF\s*[=(]\s*\d+[\.,]?\d*'},
    {"name": 'n_sample', "pattern": r'\b(?:n|N)\s*[=]\s*\d+'},
    {"name": 'alpha', "pattern": r'\b(?:α|alfa|alpha)\s*[=]\s*0?\.\d+'},
    {"name": 'anova', "pattern": r'ANOVA|an[aá]lisis\s+de\s+varianza'},
]
|
|
async def download_pdf(url: str) -> bytes:
    """Download a PDF and verify that the payload really is a PDF.

    A HEAD request is issued first as a cheap pre-check so that an HTML page
    (login wall, landing page, ...) is rejected without downloading it. The
    pre-check is advisory only: if the HEAD request fails or returns a
    non-2xx status, the GET is still attempted, and the ``%PDF-`` magic bytes
    of the body are the authoritative check.

    Args:
        url: Address of the PDF to fetch. Redirects are followed.

    Returns:
        The raw bytes of the downloaded PDF.

    Raises:
        ValueError: If the URL serves HTML, the GET fails or returns an error
            status, or the body does not start with ``%PDF-``. The original
            exception is chained as ``__cause__``.
    """
    # NOTE(review): verify=False disables TLS certificate validation. It is
    # kept to preserve existing behaviour, but it exposes the download to
    # man-in-the-middle tampering -- confirm this is intentional.
    async with httpx.AsyncClient(verify=False, follow_redirects=True) as client:
        try:
            # Many servers reject or mishandle HEAD, so a failure here must
            # not abort the download.
            try:
                head_resp = await client.head(url, timeout=10.0)
            except httpx.HTTPError:
                head_resp = None

            # Only trust the HEAD content-type when HEAD itself succeeded;
            # error responses (405, 403, ...) are commonly HTML pages that
            # say nothing about what GET would return.
            if head_resp is not None and 200 <= head_resp.status_code < 300:
                content_type = head_resp.headers.get('content-type', '').lower()
                if 'text/html' in content_type:
                    raise ValueError(f"URL returned HTML instead of PDF: {url}")

            res = await client.get(url, timeout=30.0)
            res.raise_for_status()

            content = res.content
            if not content.startswith(b'%PDF-'):
                raise ValueError("Downloaded file is not a valid PDF")

            return content
        except Exception as e:
            # Single failure type for callers; keep the cause for debugging.
            raise ValueError(f"Failed to download PDF from {url}: {e}") from e
|
|
async def extract_text(pdf_bytes: bytes) -> str:
    """Extract the full text of a PDF using PyMuPDF.

    Args:
        pdf_bytes: Raw contents of a PDF file.

    Returns:
        The text of every page, each followed by a newline, concatenated in
        page order. Returns an empty string if the document cannot be opened
        or read; the error is printed rather than raised (best effort).
    """
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            # join() avoids quadratic string concatenation on long documents.
            return "".join(page.get_text() + "\n" for page in doc)
        finally:
            # Release the document even if a page fails to extract.
            doc.close()
    except Exception as e:
        print(f"[PDF_PROCESSOR] Error extracting text: {e}")
        return ""
|
|
def classify_document(text: str) -> str:
    """Guess whether a document is a thesis or a journal article.

    Only the first 5000 characters (lower-cased) are inspected. Marker words
    for each kind are counted; a kind wins when it has strictly more hits
    than the other and at least two hits.

    Args:
        text: Full document text.

    Returns:
        ``'thesis'``, ``'article'``, or ``'unknown'`` when neither kind has a
        clear majority of at least two markers.
    """
    head = text[:5000].lower()

    # \b before tesis/tesina keeps words that merely end in "-tesis"
    # (hipótesis, síntesis) from being counted as thesis markers.
    thesis_score = len(re.findall(
        r'\btesis|\btesina|disertaci[oó]n|para optar|bachiller|licenciatura|maestr[ií]a',
        head,
    ))
    article_score = len(re.findall(
        r'\babstract\b|\bjournal\b|revista|doi:\s*10\.',
        head,
    ))

    if thesis_score > article_score and thesis_score >= 2:
        return 'thesis'
    if article_score > thesis_score and article_score >= 2:
        return 'article'
    return 'unknown'
|
|
def extract_statistics(text: str, patterns=None) -> list:
    """Find statistical expressions in ``text``.

    Args:
        text: Text to scan.
        patterns: Optional list of ``{"name": ..., "pattern": ...}`` dicts to
            use instead of the module-level ``STATS_PATTERNS``. Patterns are
            applied case-insensitively.

    Returns:
        One dict per pattern that matched at least once, in pattern order:
        ``"type"`` is the pattern name, ``"matches"`` holds up to ten distinct
        matched strings in order of first appearance, and ``"count"`` is the
        number of distinct matched strings.
    """
    if patterns is None:
        patterns = STATS_PATTERNS

    stats = []
    for sp in patterns:
        found = re.findall(sp["pattern"], text, re.IGNORECASE)
        # dict.fromkeys de-duplicates while keeping first-occurrence order,
        # so the reported sample of matches is deterministic across runs.
        unique = list(dict.fromkeys(found))
        if unique:
            stats.append({
                "type": sp["name"],
                "matches": unique[:10],
                "count": len(unique),
            })
    return stats
|
|
def _find_section_starts(text: str) -> list:
    """Return one marker dict per line of ``text`` that looks like a section heading."""
    # Compile once instead of re-resolving every pattern for every line.
    compiled = [
        (sp, re.compile(sp["pattern"], re.IGNORECASE))
        for sp in ACADEMIC_SECTION_PATTERNS
    ]

    starts = []
    char_offset = 0
    for line_idx, line in enumerate(text.split('\n')):
        candidate = line.strip()
        # Only lines of 3 to 99 characters (after stripping) are considered.
        if 2 < len(candidate) < 100:
            for sp, regex in compiled:
                if regex.search(candidate):
                    starts.append({
                        "name": sp["name"],
                        "category": sp["category"],
                        "lineIdx": line_idx,
                        "charIdx": char_offset,
                    })
                    break  # first matching pattern wins
        # +1 accounts for the '\n' removed by split().
        char_offset += len(line) + 1
    return starts


def _build_sections(text: str, starts: list) -> list:
    """Slice ``text`` between consecutive section starts and attach statistics."""
    # Each section ends where the next one begins; the last runs to the end.
    ends = [nxt["charIdx"] for nxt in starts[1:]] + [len(text)]

    sections = []
    for start, end_idx in zip(starts, ends):
        section_text = text[start["charIdx"]:end_idx].strip()[:8000]
        stats = extract_statistics(section_text)
        sections.append({
            "name": start["name"],
            "category": start["category"],
            "content": section_text[:5000],
            "statistics": stats,
            "hasNumericalData": bool(stats) or bool(re.search(r'\d+[\.,]\d+', section_text)),
        })
    return sections


async def analyze_academic_document(url_or_path: str) -> dict:
    """Download or read a PDF, split it into sections, and collect statistics.

    Args:
        url_or_path: An ``http://`` or ``https://`` URL (scheme matched
            case-insensitively) to download, or otherwise a local file path
            to read.

    Returns:
        A dict with:
            ``"documentType"``: result of ``classify_document``.
            ``"sections"``: list of detected sections, each with ``name``,
                ``category``, ``content`` (at most 5000 characters),
                ``statistics`` (computed on at most the first 8000
                characters of the section) and ``hasNumericalData``.
            ``"globalStatistics"``: statistics over the whole text.
            ``"summary"``: ``totalSections`` and ``totalStatisticalItems``.

    Raises:
        ValueError: Propagated from ``download_pdf`` when a URL cannot be
            fetched as a PDF.
        OSError: If a local path cannot be opened.
    """
    # Require a real scheme prefix so local paths that merely begin with
    # "http" (e.g. "httpdocs/paper.pdf") are read from disk.
    if url_or_path.lower().startswith(("http://", "https://")):
        pdf_bytes = await download_pdf(url_or_path)
    else:
        with open(url_or_path, 'rb') as f:
            pdf_bytes = f.read()

    text = await extract_text(pdf_bytes)
    doc_type = classify_document(text)

    sections = _build_sections(text, _find_section_starts(text))
    global_stats = extract_statistics(text)

    return {
        "documentType": doc_type,
        "sections": sections,
        "globalStatistics": global_stats,
        "summary": {
            "totalSections": len(sections),
            "totalStatisticalItems": sum(s["count"] for s in global_stats),
        },
    }
|
|