# Source: letxinet/backend/tools/pdf_processor.py
# Commit 68fb5e2 ("Initial upload for Build Small Hackathon"), uploaded by C2MV.
import fitz # PyMuPDF
import re
import httpx
import tempfile
import os
import asyncio
# Lifetime of a cache entry, expressed in seconds (600 s == 10 minutes).
CACHE_TTL = 600
# Module-level cache container, created empty at import time.
# NOTE(review): nothing in the visible code reads or writes it — confirm the
# key/value layout against whichever caller uses it.
pdf_cache = dict()
# Heading detectors for bilingual (Spanish / English) academic documents.
# Each entry carries:
#   name     - label reported for the detected section
#   category - coarse group (front / intro / theory / methods / results /
#              conclusion / back)
#   pattern  - regex searched case-insensitively against a candidate line
# Order matters: the first pattern that matches a line wins.
#
# The accented characters below were previously stored as mojibake (UTF-8
# bytes decoded with the wrong codec), so headings such as "Introducción" or
# "Metodología" could never match. They are restored to the intended letters.
# The conclusion pattern also accepts the Spanish plural "conclusiones",
# which the section name advertises but the old pattern rejected.
ACADEMIC_SECTION_PATTERNS = [
    {"name": 'Resumen / Abstract', "category": 'front', "pattern": r'\b(resumen|abstract|sumario)\b'},
    {"name": 'Palabras Clave / Keywords', "category": 'front', "pattern": r'\b(palabras\s+clave|keywords|key\s+words)\b'},
    {"name": 'Introducción / Introduction', "category": 'intro', "pattern": r'\b(\d+\.?\s*introducci[oó]n|\d+\.?\s*introduction|introducci[oó]n|introduction)\b'},
    {"name": 'Planteamiento del Problema', "category": 'intro', "pattern": r'\b(planteamiento\s+del\s+problema|problem\s+statement|formulaci[oó]n\s+del\s+problema|definici[oó]n\s+del\s+problema)\b'},
    {"name": 'Justificación', "category": 'intro', "pattern": r'\b(justificaci[oó]n|justification|importancia|relevancia|motivation)\b'},
    {"name": 'Objetivos', "category": 'intro', "pattern": r'\b(objetivos?\s*(generales?|espec[ií]ficos?)?|objectives?|goals?|aims?)\b'},
    {"name": 'Hipótesis', "category": 'intro', "pattern": r'\b(hip[oó]tesis|hypothesis|hypotheses)\b'},
    {"name": 'Marco Teórico', "category": 'theory', "pattern": r'\b(marco\s+te[oó]rico|theoretical\s+framework|fundamento\s+te[oó]rico|bases\s+te[oó]ricas|theoretical\s+background|state\s+of\s+the\s+art)\b'},
    {"name": 'Antecedentes', "category": 'theory', "pattern": r'\b(antecedentes|background|related\s+work|literature\s+review|revisi[oó]n\s+de\s+literatura|estado\s+del\s+arte|trabajos\s+previos|prior\s+work)\b'},
    {"name": 'Bases Conceptuales', "category": 'theory', "pattern": r'\b(bases\s+conceptuales|marco\s+conceptual|conceptual\s+framework|definici[oó]n\s+de\s+t[eé]rminos|glosario)\b'},
    {"name": 'Metodología / Methods', "category": 'methods', "pattern": r'\b(\d+\.?\s*metodolog[ií]a|\d+\.?\s*methods?|metodolog[ií]a|methods?|methodology|materiales?\s+y\s+m[eé]todos?|materials?\s+and\s+methods?|procedimiento|approach|proposed\s+method|dise[ñn]o\s+metodol[oó]gico)\b'},
    {"name": 'Población y Muestra', "category": 'methods', "pattern": r'\b(poblaci[oó]n\s+y\s+muestra|population\s+and\s+sample|sample\s+size|muestra|participants?|participantes|sujetos)\b'},
    {"name": 'Instrumentos', "category": 'methods', "pattern": r'\b(instrumentos?\s+de\s+recolecci[oó]n|instruments?|herramientas|cuestionario|encuesta|survey|data\s+collection)\b'},
    {"name": 'Resultados / Results', "category": 'results', "pattern": r'\b(\d+\.?\s*resultados|\d+\.?\s*results|resultados|results|findings|hallazgos)\b'},
    {"name": 'Análisis de Datos', "category": 'results', "pattern": r'\b(an[aá]lisis\s+de\s+(datos|resultados)|data\s+analysis|analysis\s+of\s+results|an[aá]lisis\s+estad[ií]stico|statistical\s+analysis)\b'},
    {"name": 'Discusión / Discussion', "category": 'results', "pattern": r'\b(\d+\.?\s*discusi[oó]n|\d+\.?\s*discussion|discusi[oó]n|discussion|interpretaci[oó]n)\b'},
    {"name": 'Conclusiones / Conclusions', "category": 'conclusion', "pattern": r'\b(\d+\.?\s*conclusi[oó]n(?:es)?|\d+\.?\s*conclusions?|conclusi[oó]n(?:es)?|conclusions?|concluding\s+remarks)\b'},
    {"name": 'Recomendaciones', "category": 'conclusion', "pattern": r'\b(recomendaciones|recommendations|sugerencias|suggestions|future\s+work|trabajo\s+futuro|trabajos?\s+futuros?)\b'},
    {"name": 'Referencias / References', "category": 'back', "pattern": r'\b(referencias|references|bibliograf[ií]a|bibliography|works\s+cited)\b'},
]
# Regexes that recognise common statistical notation in running text.
# Each entry carries a short type name and the pattern searched for it.
#
# The non-ASCII symbols (≤ ≥ χ ² α á) were previously stored as mojibake and
# therefore never matched real text; they are restored here.
# The percentage pattern used to require a decimal separator, so whole-number
# percentages such as "45%" were skipped; the fractional part is now optional
# while everything the old pattern matched is still accepted.
STATS_PATTERNS = [
    {"name": 'p-value', "pattern": r'p\s*[<>=≤≥]\s*0?\.\d+'},
    {"name": 'percentage', "pattern": r'\d+(?:[\.,]\d*)?\s*%'},
    {"name": 'mean_std', "pattern": r'(?:media|mean|promedio|average|M)\s*[=:]\s*\d+[\.,]?\d*'},
    {"name": 'correlation', "pattern": r'r\s*[=]\s*[+-]?0?\.\d+'},
    {"name": 'chi_square', "pattern": r'(?:chi|χ)[²2]\s*[=()]\s*\d+[\.,]?\d*'},
    {"name": 'confidence_interval', "pattern": r'(?:IC|CI)\s*[=:(\[]\s*\d+'},
    {"name": 't_test', "pattern": r't\s*[=(]\s*\d+[\.,]?\d*'},
    {"name": 'f_test', "pattern": r'F\s*[=(]\s*\d+[\.,]?\d*'},
    {"name": 'n_sample', "pattern": r'(?:n|N)\s*[=]\s*\d+'},
    {"name": 'alpha', "pattern": r'(?:α|alfa|alpha)\s*[=]\s*0?\.\d+'},
    {"name": 'anova', "pattern": r'ANOVA|an[aá]lisis\s+de\s+varianza'},
]
async def download_pdf(url: str) -> bytes:
    """Download a PDF and return its raw bytes.

    A HEAD request is sent first as a cheap pre-check: when it succeeds and
    reports a ``text/html`` content type the download is rejected before the
    body is fetched. The HEAD probe is best-effort only - a transport error
    or an error status on HEAD does not abort the download, because the
    following GET (validated through the ``%PDF-`` magic bytes) is the
    authoritative check.

    Args:
        url: Address of the PDF to fetch. Redirects are followed.

    Returns:
        The response body, guaranteed to start with ``%PDF-``.

    Raises:
        ValueError: On any failure (network error, HTTP error status, HTML
            response, body that is not a PDF). The original exception is
            chained as ``__cause__``.
    """
    # NOTE(review): verify=False turns off TLS certificate verification.
    # Kept as-is to preserve behaviour, but confirm that it is intentional.
    async with httpx.AsyncClient(verify=False, follow_redirects=True) as client:
        try:
            try:
                head_res = await client.head(url, timeout=10.0)
            except httpx.HTTPError:
                # HEAD is only an optimisation; let the GET decide.
                head_res = None

            # Only trust the content type of a successful HEAD: an HTML error
            # page returned for a rejected HEAD says nothing about the GET.
            if head_res is not None and head_res.status_code < 400:
                content_type = head_res.headers.get('content-type', '').lower()
                if 'text/html' in content_type:
                    raise ValueError(f"URL returned HTML instead of PDF: {url}")

            res = await client.get(url, timeout=30.0)
            res.raise_for_status()

            content = res.content
            if not content.startswith(b'%PDF-'):
                raise ValueError("Downloaded file is not a valid PDF")
            return content
        except Exception as e:
            raise ValueError(f"Failed to download PDF from {url}: {e}") from e
async def extract_text(pdf_bytes: bytes) -> str:
    """Extract the full text of a PDF using PyMuPDF.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        The text of every page, each followed by a newline, concatenated in
        page order. An empty string is returned when the document cannot be
        opened or read; the error is printed, not raised.
    """
    try:
        # The context manager closes the document even if a page fails to
        # render, which the explicit close() after the loop did not.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            return "".join(page.get_text() + "\n" for page in doc)
    except Exception as e:
        # Deliberate best-effort: callers receive "" instead of an exception.
        print(f"[PDF_PROCESSOR] Error extracting text: {e}")
        return ""
def classify_document(text: str) -> str:
    """Guess whether a document is a thesis or a journal article.

    Only the first 5000 characters are inspected, lower-cased. Keyword hits
    are counted for each kind; a kind wins when it has strictly more hits
    than the other and at least two hits.

    Args:
        text: Full document text.

    Returns:
        ``'thesis'``, ``'article'`` or ``'unknown'``.
    """
    head = text[:5000].lower()

    # The accented alternatives were stored as mojibake, so "disertación" and
    # "maestría" were never counted; the intended letters are restored.
    # NOTE(review): 'tesis' is unanchored and therefore also counts inside
    # "hipótesis" - confirm whether that is intended.
    thesis_score = len(re.findall(
        r'tesis|tesina|disertaci[oó]n|para optar|bachiller|licenciatura|maestr[ií]a',
        head,
    ))
    article_score = len(re.findall(
        r'\babstract\b|\bjournal\b|revista|doi:\s*10\.',
        head,
    ))

    if thesis_score > article_score and thesis_score >= 2:
        return 'thesis'
    if article_score > thesis_score and article_score >= 2:
        return 'article'
    return 'unknown'
def extract_statistics(text: str) -> list:
    """Collect statistical expressions found in ``text``.

    Every pattern in ``STATS_PATTERNS`` is searched case-insensitively.

    Args:
        text: Text to scan.

    Returns:
        One dict per pattern that matched at least once, with keys:
        ``type`` (pattern name), ``matches`` (up to 10 distinct matches in
        order of first appearance) and ``count`` (number of distinct
        matches).
    """
    stats = []
    for sp in STATS_PATTERNS:
        found = re.findall(sp["pattern"], text, re.IGNORECASE)
        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        # The previous list(set(...)) made the order - and therefore which
        # ten matches survived the truncation - vary between runs.
        matches = list(dict.fromkeys(found))
        if matches:
            stats.append({
                "type": sp["name"],
                "matches": matches[:10],
                "count": len(matches),
            })
    return stats
async def analyze_academic_document(url_or_path: str) -> dict:
    """Load a PDF, split it into academic sections and extract statistics.

    Args:
        url_or_path: Either an ``http://`` / ``https://`` URL, which is
            downloaded, or a local file path, which is read from disk.

    Returns:
        A dict with:
        ``documentType`` - result of ``classify_document``;
        ``sections`` - list of dicts (``name``, ``category``, ``content``
        limited to 5000 characters, ``statistics``, ``hasNumericalData``);
        ``globalStatistics`` - statistics over the whole text;
        ``summary`` - ``totalSections`` and ``totalStatisticalItems``.

    Raises:
        ValueError: Propagated from ``download_pdf`` when a URL cannot be
            fetched as a PDF.
        OSError: When a local path cannot be opened.
    """
    # Require a real scheme prefix: a bare startswith("http") also routed
    # local paths such as "httpdocs/paper.pdf" to the downloader.
    if url_or_path.lower().startswith(("http://", "https://")):
        pdf_bytes = await download_pdf(url_or_path)
    else:
        with open(url_or_path, 'rb') as f:
            pdf_bytes = f.read()

    text = await extract_text(pdf_bytes)
    doc_type = classify_document(text)

    # Pass 1: find heading-like lines and remember where each one starts.
    section_starts = []
    char_offset = 0
    for line_idx, line in enumerate(text.split('\n')):
        clean_line = line.strip()
        # Only short lines are considered as heading candidates.
        if 2 < len(clean_line) < 100:
            for sp in ACADEMIC_SECTION_PATTERNS:
                if re.search(sp["pattern"], clean_line, re.IGNORECASE):
                    section_starts.append({
                        "name": sp["name"],
                        "category": sp["category"],
                        "lineIdx": line_idx,
                        "charIdx": char_offset,
                    })
                    break  # first matching pattern wins
        # +1 accounts for the '\n' removed by split().
        char_offset += len(line) + 1

    # Pass 2: a section runs until the next heading (or the end of the text).
    end_offsets = [s["charIdx"] for s in section_starts[1:]] + [len(text)]
    sections = []
    for start, end_idx in zip(section_starts, end_offsets):
        content = text[start["charIdx"]:end_idx].strip()
        section_text = content[:8000]  # Limit to avoid massive text blocks
        stats = extract_statistics(section_text)
        sections.append({
            "name": start["name"],
            "category": start["category"],
            "content": section_text[:5000],
            "statistics": stats,
            "hasNumericalData": len(stats) > 0 or bool(re.search(r'\d+[\.,]\d+', section_text)),
        })

    global_stats = extract_statistics(text)
    return {
        "documentType": doc_type,
        "sections": sections,
        "globalStatistics": global_stats,
        "summary": {
            "totalSections": len(sections),
            "totalStatisticalItems": sum(s["count"] for s in global_stats),
        },
    }