Spaces:

C2MV
/

letxinet

Runtime error

File size: 8,335 Bytes

68fb5e2

import fitz  # PyMuPDF
import re
import httpx
import tempfile
import os
import asyncio

# Time-to-live value expressed in seconds (10 * 60 = 600).
CACHE_TTL = 10 * 60  # 10 minutes (in seconds)
# Module-level dictionary, initially empty.
# NOTE(review): presumably a cache of downloaded/processed PDFs governed by
# CACHE_TTL, but nothing in the visible code reads or writes it -- confirm
# whether it is used elsewhere or is dead state.
pdf_cache = {}

# Table of section-heading detectors for academic documents.
#
# Each entry carries:
#   "name"     - display label reported for a detected section,
#   "category" - coarse grouping label (front / intro / theory / methods /
#                results / conclusion / back),
#   "pattern"  - regex listing Spanish and English spellings of the heading;
#                several entries also accept an optional leading number
#                such as "1." or "2".
#
# analyze_academic_document applies these with re.search and re.IGNORECASE to
# each short line and stops at the FIRST entry that matches, so the order of
# this list decides which label wins when a line matches more than one pattern.
ACADEMIC_SECTION_PATTERNS = [
    {"name": 'Resumen / Abstract', "category": 'front', "pattern": r'\b(resumen|abstract|sumario)\b'},
    {"name": 'Palabras Clave / Keywords', "category": 'front', "pattern": r'\b(palabras\s+clave|keywords|key\s+words)\b'},
    {"name": 'Introducción / Introduction', "category": 'intro', "pattern": r'\b(\d+\.?\s*introducci[oó]n|\d+\.?\s*introduction|introducci[oó]n|introduction)\b'},
    {"name": 'Planteamiento del Problema', "category": 'intro', "pattern": r'\b(planteamiento\s+del\s+problema|problem\s+statement|formulaci[oó]n\s+del\s+problema|definici[oó]n\s+del\s+problema)\b'},
    {"name": 'Justificación', "category": 'intro', "pattern": r'\b(justificaci[oó]n|justification|importancia|relevancia|motivation)\b'},
    {"name": 'Objetivos', "category": 'intro', "pattern": r'\b(objetivos?\s*(generales?|espec[ií]ficos?)?|objectives?|goals?|aims?)\b'},
    {"name": 'Hipótesis', "category": 'intro', "pattern": r'\b(hip[oó]tesis|hypothesis|hypotheses)\b'},
    {"name": 'Marco Teórico', "category": 'theory', "pattern": r'\b(marco\s+te[oó]rico|theoretical\s+framework|fundamento\s+te[oó]rico|bases\s+te[oó]ricas|theoretical\s+background|state\s+of\s+the\s+art)\b'},
    {"name": 'Antecedentes', "category": 'theory', "pattern": r'\b(antecedentes|background|related\s+work|literature\s+review|revisi[oó]n\s+de\s+literatura|estado\s+del\s+arte|trabajos\s+previos|prior\s+work)\b'},
    {"name": 'Bases Conceptuales', "category": 'theory', "pattern": r'\b(bases\s+conceptuales|marco\s+conceptual|conceptual\s+framework|definici[oó]n\s+de\s+t[eé]rminos|glosario)\b'},
    {"name": 'Metodología / Methods', "category": 'methods', "pattern": r'\b(\d+\.?\s*metodolog[ií]a|\d+\.?\s*methods?|metodolog[ií]a|methods?|methodology|materiales?\s+y\s+m[eé]todos?|materials?\s+and\s+methods?|procedimiento|approach|proposed\s+method|dise[ñn]o\s+metodol[oó]gico)\b'},
    {"name": 'Población y Muestra', "category": 'methods', "pattern": r'\b(poblaci[oó]n\s+y\s+muestra|population\s+and\s+sample|sample\s+size|muestra|participants?|participantes|sujetos)\b'},
    {"name": 'Instrumentos', "category": 'methods', "pattern": r'\b(instrumentos?\s+de\s+recolecci[oó]n|instruments?|herramientas|cuestionario|encuesta|survey|data\s+collection)\b'},
    {"name": 'Resultados / Results', "category": 'results', "pattern": r'\b(\d+\.?\s*resultados|\d+\.?\s*results|resultados|results|findings|hallazgos)\b'},
    {"name": 'Análisis de Datos', "category": 'results', "pattern": r'\b(an[aá]lisis\s+de\s+(datos|resultados)|data\s+analysis|analysis\s+of\s+results|an[aá]lisis\s+estad[ií]stico|statistical\s+analysis)\b'},
    {"name": 'Discusión / Discussion', "category": 'results', "pattern": r'\b(\d+\.?\s*discusi[oó]n|\d+\.?\s*discussion|discusi[oó]n|discussion|interpretaci[oó]n)\b'},
    {"name": 'Conclusiones / Conclusions', "category": 'conclusion', "pattern": r'\b(\d+\.?\s*conclusi[oó]n|\d+\.?\s*conclusions?|conclusi[oó]n|conclusions?|concluding\s+remarks)\b'},
    {"name": 'Recomendaciones', "category": 'conclusion', "pattern": r'\b(recomendaciones|recommendations|sugerencias|suggestions|future\s+work|trabajo\s+futuro|trabajos?\s+futuros?)\b'},
    {"name": 'Referencias / References', "category": 'back', "pattern": r'\b(referencias|references|bibliograf[ií]a|bibliography|works\s+cited)\b'}
]

# Table of regexes that recognise statistical notation in running text.
#
# Each entry has a "name" (the label reported for that kind of statistic) and
# a "pattern". extract_statistics runs every pattern with re.findall and
# re.IGNORECASE, so single-letter tokens such as "F", "M" or "N" also match
# their lowercase forms. All groups are non-capturing on purpose: re.findall
# then returns the whole matched text rather than a sub-group.
STATS_PATTERNS = [
    {"name": 'p-value', "pattern": r'p\s*[<>=≤≥]\s*0?\.\d+'},
    # The decimal part is optional so integer percentages ("45%") are found
    # too; the previous form required a separator and only matched "45.0%".
    {"name": 'percentage', "pattern": r'\d+(?:[\.,]\d*)?\s*%'},
    {"name": 'mean_std', "pattern": r'(?:media|mean|promedio|average|M)\s*[=:]\s*\d+[\.,]?\d*'},
    {"name": 'correlation', "pattern": r'r\s*[=]\s*[+-]?0?\.\d+'},
    {"name": 'chi_square', "pattern": r'(?:chi|χ)[²2]\s*[=()]\s*\d+[\.,]?\d*'},
    {"name": 'confidence_interval', "pattern": r'(?:IC|CI)\s*[=:(\[]\s*\d+'},
    {"name": 't_test', "pattern": r't\s*[=(]\s*\d+[\.,]?\d*'},
    {"name": 'f_test', "pattern": r'F\s*[=(]\s*\d+[\.,]?\d*'},
    {"name": 'n_sample', "pattern": r'(?:n|N)\s*[=]\s*\d+'},
    {"name": 'alpha', "pattern": r'(?:α|alfa|alpha)\s*[=]\s*0?\.\d+'},
    {"name": 'anova', "pattern": r'ANOVA|an[aá]lisis\s+de\s+varianza'}
]

async def download_pdf(url: str) -> bytes:
    """Download a PDF, rejecting responses that are really HTML pages.

    A HEAD request is issued first as a cheap content-type probe; the body is
    then fetched with GET and checked for the ``%PDF-`` magic prefix.

    Args:
        url: Address of the document to fetch. Redirects are followed.

    Returns:
        The raw bytes of the downloaded PDF.

    Raises:
        ValueError: If the probe reports HTML, the GET fails or returns an
            error status, or the body does not start with ``%PDF-``. The
            underlying exception, if any, is attached as ``__cause__``.
    """
    # NOTE(review): verify=False disables TLS certificate verification for
    # every request made here. Left unchanged because callers may rely on it
    # for hosts with broken certificates, but it exposes downloads to
    # man-in-the-middle tampering -- confirm this is intentional.
    async with httpx.AsyncClient(verify=False, follow_redirects=True) as client:
        try:
            # The HEAD probe is best-effort: some servers reject or mishandle
            # HEAD even though GET works, so a transport failure here must not
            # abort the download.
            try:
                head_resp = await client.head(url, timeout=10.0)
            except httpx.HTTPError:
                head_resp = None

            # Only trust the probe when it succeeded; an error status (405,
            # 403, ...) commonly comes with an HTML error page that says
            # nothing about what GET will return.
            if head_resp is not None and head_resp.status_code < 400:
                content_type = head_resp.headers.get('content-type', '').lower()
                if 'text/html' in content_type:
                    raise ValueError(f"URL returned HTML instead of PDF: {url}")

            res = await client.get(url, timeout=30.0)
            res.raise_for_status()

            content = res.content
            if not content.startswith(b'%PDF-'):
                raise ValueError("Downloaded file is not a valid PDF")

            return content
        except Exception as e:
            # Normalise every failure to ValueError (the type callers see),
            # keeping the original exception chained for debugging.
            raise ValueError(f"Failed to download PDF from {url}: {e}") from e

async def extract_text(pdf_bytes: bytes) -> str:
    """Extract the full text of a PDF using PyMuPDF.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        The text of every page, each followed by a newline, concatenated in
        page order. An empty string is returned if the document cannot be
        opened or read; the error is printed rather than raised.
    """
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            # join() avoids rebuilding the accumulated string on every page.
            return "".join(page.get_text() + "\n" for page in doc)
        finally:
            # Release the document even when a page fails to extract.
            doc.close()
    except Exception as e:
        # Best-effort by design: callers receive "" instead of an exception.
        print(f"[PDF_PROCESSOR] Error extracting text: {e}")
        return ""

def classify_document(text: str) -> str:
    """Guess whether a document is a thesis or a journal article.

    Only the first 5000 characters are inspected, lowercased. Keyword hits
    are counted for each kind; a kind wins when it has strictly more hits
    than the other and at least two hits.

    Args:
        text: Full document text.

    Returns:
        ``'thesis'``, ``'article'``, or ``'unknown'`` when neither kind wins.
    """
    lower_text = text[:5000].lower()

    # "tesis" is anchored with word boundaries: unanchored, it also matched
    # inside "hipótesis", "síntesis" and "paréntesis", inflating the thesis
    # score for any Spanish text that discusses hypotheses.
    thesis_score = len(re.findall(r'\btesis\b|tesina|disertaci[oó]n|para optar|bachiller|licenciatura|maestr[ií]a', lower_text))
    article_score = len(re.findall(r'\babstract\b|\bjournal\b|revista|doi:\s*10\.', lower_text))

    if thesis_score > article_score and thesis_score >= 2:
        return 'thesis'
    if article_score > thesis_score and article_score >= 2:
        return 'article'
    return 'unknown'

def extract_statistics(text: str) -> list:
    """Collect the distinct statistical expressions found in ``text``.

    Every entry of STATS_PATTERNS is applied case-insensitively.

    Args:
        text: Text to scan.

    Returns:
        One dict per pattern that matched at least once, with keys:
        ``"type"`` (the pattern's name), ``"matches"`` (up to 10 distinct
        matched strings, in order of first appearance) and ``"count"`` (the
        total number of distinct matched strings). Patterns with no match
        are omitted.
    """
    stats = []
    for sp in STATS_PATTERNS:
        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        # A set would make the order -- and therefore which 10 samples are
        # kept -- vary between runs because of string hash randomisation.
        matches = list(dict.fromkeys(re.findall(sp["pattern"], text, re.IGNORECASE)))
        if matches:
            stats.append({
                "type": sp["name"],
                "matches": matches[:10],
                "count": len(matches)
            })
    return stats

async def analyze_academic_document(url_or_path: str) -> dict:
    """Load a PDF, split it into academic sections and gather statistics.

    Args:
        url_or_path: A location starting with ``"http"`` is downloaded;
            anything else is opened as a local file in binary mode.

    Returns:
        A dict with ``"documentType"``, ``"sections"`` (one entry per
        detected heading, holding its name, category, up to 5000 characters
        of content, per-section statistics and a ``"hasNumericalData"``
        flag), ``"globalStatistics"`` for the whole text, and a
        ``"summary"`` with the section count and the summed statistic counts.
    """
    if not url_or_path.startswith("http"):
        with open(url_or_path, 'rb') as handle:
            pdf_bytes = handle.read()
    else:
        pdf_bytes = await download_pdf(url_or_path)

    text = await extract_text(pdf_bytes)
    doc_type = classify_document(text)

    # Pass 1: find heading lines and remember where each starts in `text`.
    headings = []
    offset = 0
    for line_no, raw_line in enumerate(text.split('\n')):
        candidate = raw_line.strip()
        # Only short, non-trivial lines are considered as possible headings.
        if 2 < len(candidate) < 100:
            # First pattern in table order that matches decides the label.
            hit = next(
                (
                    spec for spec in ACADEMIC_SECTION_PATTERNS
                    if re.search(spec["pattern"], candidate, re.IGNORECASE)
                ),
                None,
            )
            if hit is not None:
                headings.append({
                    "name": hit["name"],
                    "category": hit["category"],
                    "lineIdx": line_no,
                    "charIdx": offset,
                })
        # +1 accounts for the '\n' removed by split().
        offset += len(raw_line) + 1

    # Pass 2: each section runs from its heading to the next heading, and the
    # last section runs to the end of the text.
    stops = [later["charIdx"] for later in headings[1:]] + [len(text)]
    sections = []
    for heading, stop in zip(headings, stops):
        # Limit to avoid massive text blocks.
        section_text = text[heading["charIdx"]:stop].strip()[:8000]
        section_stats = extract_statistics(section_text)
        has_numbers = bool(section_stats) or bool(re.search(r'\d+[\.,]\d+', section_text))
        sections.append({
            "name": heading["name"],
            "category": heading["category"],
            "content": section_text[:5000],
            "statistics": section_stats,
            "hasNumericalData": has_numbers,
        })

    global_stats = extract_statistics(text)
    total_items = sum(entry["count"] for entry in global_stats)

    return {
        "documentType": doc_type,
        "sections": sections,
        "globalStatistics": global_stats,
        "summary": {
            "totalSections": len(sections),
            "totalStatisticalItems": total_items,
        },
    }