Spaces:

C2MV
/

letxinet

Runtime error

App Files Files Community

letxinet / backend /tools /pdf_processor.py

C2MV

Initial upload for Build Small Hackathon

68fb5e2 verified 14 days ago

Raw

History Blame Contribute Delete

8.34 kB

	import fitz # PyMuPDF
	import re
	import httpx
	import tempfile
	import os
	import asyncio

	# Cache time-to-live expressed in seconds: 60 seconds times 10 = ten minutes.
	CACHE_TTL = 60 * 10
	# Module-level dictionary, created empty at import time.
	pdf_cache = dict()

	# Heading detectors for academic documents (Spanish and English).
	# Each entry carries a display name, a coarse category and a regular
	# expression that is searched (case-insensitively by the caller) inside a
	# candidate heading line.
	# The alternatives inside each pattern are separated by a plain `|`: an
	# escaped `\|` would be a literal pipe character and would never match a
	# real heading.
	ACADEMIC_SECTION_PATTERNS = [
	    {"name": 'Resumen / Abstract', "category": 'front', "pattern": r'\b(resumen|abstract|sumario)\b'},
	    {"name": 'Palabras Clave / Keywords', "category": 'front', "pattern": r'\b(palabras\s+clave|keywords|key\s+words)\b'},
	    {"name": 'Introducción / Introduction', "category": 'intro', "pattern": r'\b(\d+\.?\sintroducci[oó]n|\d+\.?\sintroduction|introducci[oó]n|introduction)\b'},
	    {"name": 'Planteamiento del Problema', "category": 'intro', "pattern": r'\b(planteamiento\s+del\s+problema|problem\s+statement|formulaci[oó]n\s+del\s+problema|definici[oó]n\s+del\s+problema)\b'},
	    {"name": 'Justificación', "category": 'intro', "pattern": r'\b(justificaci[oó]n|justification|importancia|relevancia|motivation)\b'},
	    {"name": 'Objetivos', "category": 'intro', "pattern": r'\b(objetivos?\s*(generales?|espec[ií]ficos?)?|objectives?|goals?|aims?)\b'},
	    {"name": 'Hipótesis', "category": 'intro', "pattern": r'\b(hip[oó]tesis|hypothesis|hypotheses)\b'},
	    {"name": 'Marco Teórico', "category": 'theory', "pattern": r'\b(marco\s+te[oó]rico|theoretical\s+framework|fundamento\s+te[oó]rico|bases\s+te[oó]ricas|theoretical\s+background|state\s+of\s+the\s+art)\b'},
	    {"name": 'Antecedentes', "category": 'theory', "pattern": r'\b(antecedentes|background|related\s+work|literature\s+review|revisi[oó]n\s+de\s+literatura|estado\s+del\s+arte|trabajos\s+previos|prior\s+work)\b'},
	    {"name": 'Bases Conceptuales', "category": 'theory', "pattern": r'\b(bases\s+conceptuales|marco\s+conceptual|conceptual\s+framework|definici[oó]n\s+de\s+t[eé]rminos|glosario)\b'},
	    {"name": 'Metodología / Methods', "category": 'methods', "pattern": r'\b(\d+\.?\smetodolog[ií]a|\d+\.?\smethods?|metodolog[ií]a|methods?|methodology|materiales?\s+y\s+m[eé]todos?|materials?\s+and\s+methods?|procedimiento|approach|proposed\s+method|dise[ñn]o\s+metodol[oó]gico)\b'},
	    {"name": 'Población y Muestra', "category": 'methods', "pattern": r'\b(poblaci[oó]n\s+y\s+muestra|population\s+and\s+sample|sample\s+size|muestra|participants?|participantes|sujetos)\b'},
	    {"name": 'Instrumentos', "category": 'methods', "pattern": r'\b(instrumentos?\s+de\s+recolecci[oó]n|instruments?|herramientas|cuestionario|encuesta|survey|data\s+collection)\b'},
	    {"name": 'Resultados / Results', "category": 'results', "pattern": r'\b(\d+\.?\sresultados|\d+\.?\sresults|resultados|results|findings|hallazgos)\b'},
	    {"name": 'Análisis de Datos', "category": 'results', "pattern": r'\b(an[aá]lisis\s+de\s+(datos|resultados)|data\s+analysis|analysis\s+of\s+results|an[aá]lisis\s+estad[ií]stico|statistical\s+analysis)\b'},
	    {"name": 'Discusión / Discussion', "category": 'results', "pattern": r'\b(\d+\.?\sdiscusi[oó]n|\d+\.?\sdiscussion|discusi[oó]n|discussion|interpretaci[oó]n)\b'},
	    {"name": 'Conclusiones / Conclusions', "category": 'conclusion', "pattern": r'\b(\d+\.?\sconclusi[oó]n|\d+\.?\sconclusions?|conclusi[oó]n|conclusions?|concluding\s+remarks)\b'},
	    {"name": 'Recomendaciones', "category": 'conclusion', "pattern": r'\b(recomendaciones|recommendations|sugerencias|suggestions|future\s+work|trabajo\s+futuro|trabajos?\s+futuros?)\b'},
	    {"name": 'Referencias / References', "category": 'back', "pattern": r'\b(referencias|references|bibliograf[ií]a|bibliography|works\s+cited)\b'},
	]

	# Detectors for statistical expressions quoted in running text.
	# Each entry has a short type name and a regular expression.
	# Alternatives are separated by a plain `|` (an escaped `\|` would be a
	# literal pipe), and whitespace around operators is optional so that both
	# "p < 0.05" and "p<0.05" are recognised.
	# Groups are non-capturing so a match always denotes the whole expression.
	STATS_PATTERNS = [
	    {"name": 'p-value', "pattern": r'p\s*[<>=≤≥]\s*0?\.\d+'},
	    {"name": 'percentage', "pattern": r'\d+[\.,]\d+\s*%'},
	    {"name": 'mean_std', "pattern": r'(?:media|mean|promedio|average|M)\s*[=:]\s*\d+[\.,]?\d*'},
	    {"name": 'correlation', "pattern": r'r\s*[=]\s*[+-]?0?\.\d+'},
	    {"name": 'chi_square', "pattern": r'(?:chi|χ)[²2]\s*[=()]\s*\d+[\.,]?\d*'},
	    {"name": 'confidence_interval', "pattern": r'(?:IC|CI)\s*[=:(\[]\s*\d+'},
	    {"name": 't_test', "pattern": r't\s*[=(]\s*\d+[\.,]?\d*'},
	    {"name": 'f_test', "pattern": r'F\s*[=(]\s*\d+[\.,]?\d*'},
	    {"name": 'n_sample', "pattern": r'(?:n|N)\s*[=]\s*\d+'},
	    {"name": 'alpha', "pattern": r'(?:α|alfa|alpha)\s*[=]\s*0?\.\d+'},
	    {"name": 'anova', "pattern": r'ANOVA|an[aá]lisis\s+de\s+varianza'},
	]

	async def download_pdf(url: str) -> bytes:
	    """Download a PDF over HTTP(S) and return its raw bytes.

	    A HEAD request is sent first so that a URL answering with an HTML page
	    can be rejected before the body is fetched. The probe is best-effort:
	    if it fails at the transport level, or the server answers it with an
	    error status, its result is ignored and the GET is attempted anyway.
	    The downloaded body must start with the ``%PDF-`` magic bytes.

	    Args:
	        url: Address of the document to fetch.

	    Returns:
	        The response body as bytes.

	    Raises:
	        ValueError: If the URL serves HTML, the request fails, the server
	            returns an error status, or the body is not a PDF. The
	            underlying exception is attached as ``__cause__``.
	    """
	    # NOTE(review): verify=False turns off TLS certificate verification.
	    # Presumably intentional to cope with misconfigured repositories;
	    # confirm this trade-off is acceptable for the deployment.
	    async with httpx.AsyncClient(verify=False, follow_redirects=True) as client:
	        try:
	            try:
	                head_resp = await client.head(url, timeout=10.0)
	            except httpx.HTTPError:
	                # Some servers refuse or stall on HEAD; let the GET decide.
	                head_resp = None

	            # Only trust the advertised type of a successful HEAD: an error
	            # page for an unsupported HEAD is usually HTML as well.
	            if head_resp is not None and head_resp.status_code < 400:
	                content_type = head_resp.headers.get('content-type', '')
	                if 'text/html' in content_type.lower():
	                    raise ValueError(f"URL returned HTML instead of PDF: {url}")

	            res = await client.get(url, timeout=30.0)
	            res.raise_for_status()

	            content = res.content
	            if not content.startswith(b'%PDF-'):
	                raise ValueError("Downloaded file is not a valid PDF")

	            return content
	        except Exception as e:
	            # Normalise every failure to ValueError while keeping the cause.
	            raise ValueError(f"Failed to download PDF from {url}: {e}") from e

	async def extract_text(pdf_bytes: bytes) -> str:
	    """Extract the full text of a PDF using PyMuPDF.

	    The text of every page is concatenated in page order, each page followed
	    by a newline.

	    Args:
	        pdf_bytes: Raw content of the PDF file.

	    Returns:
	        The extracted text, or an empty string if the document cannot be
	        opened or read (the error is printed, not raised).
	    """
	    try:
	        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
	        try:
	            # join avoids rebuilding the growing string once per page.
	            return "".join(page.get_text() + "\n" for page in doc)
	        finally:
	            # Release the document even when a page fails to extract.
	            doc.close()
	    except Exception as e:
	        print(f"[PDF_PROCESSOR] Error extracting text: {e}")
	        return ""

	def classify_document(text: str) -> str:
	    """Guess whether a document is a thesis or a journal article.

	    Only the first 5000 characters are inspected, lower-cased. Occurrences
	    of thesis-related and article-related keywords are counted; a label is
	    returned only when its count is strictly higher than the other one and
	    at least 2.

	    Args:
	        text: Full text of the document.

	    Returns:
	        ``'thesis'``, ``'article'`` or ``'unknown'``.
	    """
	    lower_text = text[:5000].lower()

	    # Alternatives use a plain `|`. `tesis` is bounded so that "hipótesis"
	    # is not counted as a thesis marker.
	    thesis_score = len(re.findall(
	        r'\btesis\b|tesina|disertaci[oó]n|para optar|bachiller|licenciatura|maestr[ií]a',
	        lower_text,
	    ))
	    article_score = len(re.findall(
	        r'\babstract\b|\bjournal\b|revista|doi:\s*10\.',
	        lower_text,
	    ))

	    if thesis_score > article_score and thesis_score >= 2:
	        return 'thesis'
	    if article_score > thesis_score and article_score >= 2:
	        return 'article'
	    return 'unknown'

	def extract_statistics(text: str, patterns=None) -> list:
	    """Collect the statistical expressions found in *text*.

	    Every pattern is searched case-insensitively. Duplicate matches are
	    collapsed while keeping the order of first appearance, so the result is
	    the same on every run.

	    Args:
	        text: Text to scan.
	        patterns: Optional list of ``{"name": ..., "pattern": ...}`` dicts;
	            defaults to the module-level ``STATS_PATTERNS``.

	    Returns:
	        One dict per pattern that matched at least once, with keys
	        ``"type"`` (the pattern name), ``"matches"`` (at most the first 10
	        distinct matches) and ``"count"`` (number of distinct matches).
	    """
	    if patterns is None:
	        patterns = STATS_PATTERNS

	    stats = []
	    for spec in patterns:
	        # group(0) is the whole match even if a pattern has capture groups.
	        found = (m.group(0) for m in re.finditer(spec["pattern"], text, re.IGNORECASE))
	        # dict.fromkeys removes duplicates but, unlike set, keeps order.
	        unique = list(dict.fromkeys(found))
	        if unique:
	            stats.append({
	                "type": spec["name"],
	                "matches": unique[:10],
	                "count": len(unique),
	            })
	    return stats

	async def analyze_academic_document(url_or_path: str) -> dict:
	    """Load a PDF, split it into academic sections and collect statistics.

	    Args:
	        url_or_path: Either an ``http://`` / ``https://`` URL, which is
	            downloaded, or a path to a local file, which is read from disk.

	    Returns:
	        A dict with the keys ``"documentType"``, ``"sections"`` (one entry
	        per detected heading, with name, category, up to 5000 characters of
	        content, statistics found in the first 8000 characters, and a
	        ``"hasNumericalData"`` flag), ``"globalStatistics"`` and
	        ``"summary"``.

	    Raises:
	        ValueError: Propagated from the download step.
	        OSError: If a local path cannot be opened.
	    """
	    # Require an explicit scheme so that a local path which merely starts
	    # with "http" is not mistaken for a URL.
	    if url_or_path.lower().startswith(("http://", "https://")):
	        pdf_bytes = await download_pdf(url_or_path)
	    else:
	        with open(url_or_path, 'rb') as f:
	            pdf_bytes = f.read()

	    text = await extract_text(pdf_bytes)
	    doc_type = classify_document(text)

	    section_starts = _find_section_starts(text)
	    # A section ends where the next one begins; the last runs to the end.
	    section_ends = [nxt["charIdx"] for nxt in section_starts[1:]] + [len(text)]

	    sections = []
	    for start, end_idx in zip(section_starts, section_ends):
	        content = text[start["charIdx"]:end_idx].strip()
	        section_text = content[:8000]  # Limit to avoid massive text blocks

	        stats = extract_statistics(section_text)

	        sections.append({
	            "name": start["name"],
	            "category": start["category"],
	            "content": section_text[:5000],
	            "statistics": stats,
	            "hasNumericalData": bool(stats) or re.search(r'\d+[\.,]\d+', section_text) is not None,
	        })

	    global_stats = extract_statistics(text)

	    return {
	        "documentType": doc_type,
	        "sections": sections,
	        "globalStatistics": global_stats,
	        "summary": {
	            "totalSections": len(sections),
	            "totalStatisticalItems": sum(s["count"] for s in global_stats),
	        },
	    }


	def _find_section_starts(text: str) -> list:
	    """Locate the lines of *text* that look like academic section headings.

	    A line is a candidate when, stripped, it is longer than 2 and shorter
	    than 100 characters; it is assigned to the first entry of
	    ``ACADEMIC_SECTION_PATTERNS`` whose pattern matches it.

	    Returns:
	        A list of dicts with ``"name"``, ``"category"``, ``"lineIdx"`` (line
	        number) and ``"charIdx"`` (offset of the line start within *text*),
	        in document order.
	    """
	    starts = []
	    char_offset = 0
	    for line_idx, line in enumerate(text.split('\n')):
	        clean_line = line.strip()
	        if 2 < len(clean_line) < 100:
	            for sp in ACADEMIC_SECTION_PATTERNS:
	                if re.search(sp["pattern"], clean_line, re.IGNORECASE):
	                    starts.append({
	                        "name": sp["name"],
	                        "category": sp["category"],
	                        "lineIdx": line_idx,
	                        "charIdx": char_offset,
	                    })
	                    break
	        # +1 accounts for the newline removed by split.
	        char_offset += len(line) + 1
	    return starts