Buckets:
| import fitz # PyMuPDF | |
| import json | |
| import os | |
| import re | |
| from typing import List, Dict, Optional | |
| class OncoRAGIngestor: | |
| """ | |
| Ingestor para guías clínicas oncológicas (NCCN/ESMO). | |
| Implementa Adaptive Semantic Chunking basado en encabezados médicos en inglés. | |
| """ | |
| def __init__(self, output_dir: str = "processed_data"): | |
| self.output_dir = output_dir | |
| if not os.path.exists(output_dir): | |
| os.makedirs(output_dir) | |
| # Palabras clave para identificación de secciones semánticas en inglés (Guías reales) | |
| self.headers_keywords = [ | |
| r"Recommendation", | |
| r"Recommendations", | |
| r"Evidence", | |
| r"Algorithm", | |
| r"Discussion", | |
| r"Treatment", | |
| r"Diagnosis", | |
| r"Workup", | |
| r"Staging", | |
| r"Follow-Up", | |
| r"Principles", | |
| r"Pathology", | |
| r"Systemic Therapy" | |
| ] | |
| self.header_pattern = re.compile(f"^(?:{'|'.join(self.headers_keywords)}).*", re.IGNORECASE) | |
| # Patrones para sanitización de textos (Eliminar rastro de NCCN) | |
| self.nccn_patterns = [ | |
| re.compile(r"National Comprehensive Cancer Network", re.IGNORECASE), | |
| re.compile(r"NCCN Guidelines", re.IGNORECASE), | |
| re.compile(r"NCCN\.org", re.IGNORECASE), | |
| re.compile(r"\bNCCN\b", re.IGNORECASE) | |
| ] | |
| def sanitize_text(self, text: str) -> str: | |
| """Reemplaza rastros de la marca original por términos genéricos de guías oncológicas.""" | |
| sanitized = text | |
| for pattern in self.nccn_patterns: | |
| sanitized = pattern.sub("Oncology Guidelines", sanitized) | |
| return sanitized | |
| def extract_text_semantically(self, pdf_path: str) -> List[Dict[str, str]]: | |
| """ | |
| Extrae texto del PDF nativo respetando el orden visual con PyMuPDF | |
| y lo divide en chunks semánticos basados en encabezados médicos. | |
| """ | |
| doc = fitz.open(pdf_path) | |
| chunks = [] | |
| current_header = "Introduction / General" | |
| current_content = [] | |
| for page_num in range(len(doc)): | |
| page = doc.load_page(page_num) | |
| text_instances = page.get_text("blocks") # Extrae respetando el orden lógico de lectura (x0, y0, x1, y1, text, block_no, block_type) | |
| for block in text_instances: | |
| text = block[4].strip() | |
| if not text: | |
| continue | |
| # Sanitizar el texto | |
| text = self.sanitize_text(text) | |
| # Detectar si el bloque parece ser un encabezado médico relevante | |
| if self.header_pattern.match(text) and len(text) < 120: | |
| # Guardar el chunk anterior si existe y tiene contenido válido | |
| if current_content: | |
| chunks.append({ | |
| "header": current_header, | |
| "content": "\n".join(current_content), | |
| "source": os.path.basename(pdf_path), | |
| "page": page_num + 1 | |
| }) | |
| current_header = text | |
| current_content = [] | |
| else: | |
| current_content.append(text) | |
| # Añadir el último chunk | |
| if current_content: | |
| chunks.append({ | |
| "header": current_header, | |
| "content": "\n".join(current_content), | |
| "source": os.path.basename(pdf_path), | |
| "page": len(doc) | |
| }) | |
| return chunks | |
| def save_chunks(self, chunks: List[Dict[str, str]], filename: str): | |
| """Guarda los chunks procesados en un archivo JSON.""" | |
| output_path = os.path.join(self.output_dir, f"{filename}.json") | |
| with open(output_path, 'w', encoding='utf-8') as f: | |
| json.dump(chunks, f, ensure_ascii=False, indent=4) | |
| print(f"✅ Guardados {len(chunks)} chunks en {output_path}") | |
| if __name__ == "__main__": | |
| # Iniciar pipeline de ingesta | |
| ingestor = OncoRAGIngestor(output_dir="data/processed/chunks") | |
| # Directorio principal de guías clínicas | |
| guides_dir = "data/clinical_guides" | |
| if os.path.exists(guides_dir): | |
| pdf_files = [] | |
| for root, dirs, files in os.walk(guides_dir): | |
| for file in files: | |
| if file.endswith(".pdf"): | |
| pdf_files.append(os.path.join(root, file)) | |
| if not pdf_files: | |
| print(f"⚠️ El directorio {guides_dir} no contiene PDFs. Agrega los PDFs.") | |
| for path in pdf_files: | |
| file = os.path.basename(path) | |
| # Filtro riguroso: Excluir PDFs diseñados para pacientes | |
| if "patient" in file.lower() or "_pat" in file.lower() or "patient" in path.lower(): | |
| print(f"⏭️ Omitiendo guía para pacientes (riesgo de baja densidad médica): {file}") | |
| continue | |
| print(f"⏳ Procesando: {file}...") | |
| try: | |
| chunks = ingestor.extract_text_semantically(path) | |
| ingestor.save_chunks(chunks, file.replace(".pdf", "")) | |
| except Exception as e: | |
| print(f"❌ Error procesando {file}: {e}") | |
| else: | |
| print(f"⚠️ El directorio {guides_dir} no existe. Crea uno y añade tus PDFs de NCCN/ESMO.") | |
Xet Storage Details
- Size:
- 5.54 kB
- Xet hash:
- d76d6b4811db674737bd5103bb7d9508acf8537e063ec005f2eaddb1ea043cd6
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.