| """ |
| Export utilities for research reports. |
| Supports: DOCX, PDF, Markdown, BibTeX, ZIP (full workspace). |
| Ported from the Next.js original. |
| """ |
|
|
| import os |
| import json |
| import zipfile |
| import tempfile |
| import re |
| from datetime import datetime |
| from typing import Optional, List, Dict, Any |
| import pandas as pd |
|
|
|
|
def _project_root() -> str:
    """Return the absolute path two directory levels above this module's folder."""
    module_dir = os.path.dirname(__file__)
    two_levels_up = os.path.join(module_dir, "..", "..")
    return os.path.abspath(two_levels_up)
|
|
|
|
def _sanitize_key(value: str, fallback: str) -> str:
    """Reduce *value* to an identifier made of ASCII letters, digits and underscores.

    Every other character becomes an underscore, runs of underscores are
    collapsed to one, and leading/trailing underscores are removed.

    Args:
        value: Raw text to sanitize; a falsy value is treated as empty.
        fallback: Returned when nothing usable is left after sanitizing.

    Returns:
        The sanitized key, or *fallback* if the result would be empty.
    """
    underscored = re.sub(r'[^a-zA-Z0-9_]', '_', value if value else "")
    collapsed = re.sub(r'_+', '_', underscored)
    trimmed = collapsed.strip('_')
    if trimmed:
        return trimmed
    return fallback
|
|
|
|
# One-pass translation table for characters that need special handling inside
# a BibTeX field value.
# NOTE(review): braces are left untouched on purpose - BibTeX appears to count
# them for field delimiting even when backslash-escaped, and they are commonly
# used for case protection; confirm before adding them here.
_BIBTEX_ESCAPE_TABLE = str.maketrans({
    '\u2028': ' ',   # Unicode line separator -> plain space
    '\u2029': ' ',   # Unicode paragraph separator -> plain space
    '&': r'\&',
    '%': r'\%',
    '_': r'\_',
    '#': r'\#',      # macro-parameter character in LaTeX
    '$': r'\$',      # would otherwise toggle math mode
})


def _escape_bibtex(value: Any) -> str:
    """Return *value* as text that is safe to embed in a BibTeX field.

    Args:
        value: Any object; ``None`` becomes the empty string, everything
            else is converted with ``str()``.

    Returns:
        The text with ``&``, ``%``, ``_``, ``#`` and ``$`` backslash-escaped
        and the Unicode line/paragraph separators replaced by spaces.
    """
    text = "" if value is None else str(value)
    # A single translate() pass cannot re-escape the backslashes it inserts.
    return text.translate(_BIBTEX_ESCAPE_TABLE)
|
|
|
|
def _doc_authors(doc: Dict[str, Any]) -> str:
    """Build the BibTeX-style author string for a document.

    Args:
        doc: Document mapping; its ``"authors"`` entry may be a list of
            names or a single value.

    Returns:
        For a list, the truthy items joined with ``" and "``; for any other
        value, its string form. ``"Unknown"`` when nothing usable is present.
    """
    authors = doc.get("authors", [])
    if not isinstance(authors, list):
        return str(authors or "Unknown")
    names = [str(name) for name in authors if name]
    joined = " and ".join(names)
    return joined if joined else "Unknown"
|
|
|
|
def _markdown_to_latex_body(report_md: str) -> str:
    """Convert Markdown headings to LaTeX sectioning commands and drop ``**``.

    Heading levels 1 and 2 both map to ``\\section`` and level 3 to
    ``\\subsection``. Levels 4-6 map to ``\\subsubsection``, ``\\paragraph``
    and ``\\subparagraph`` so that no raw ``#`` from a heading reaches LaTeX.
    Bold markers (``**``) are removed; all other text is passed through
    unchanged.

    Args:
        report_md: Markdown source; a falsy value is treated as empty.

    Returns:
        The converted text.
    """
    commands = {
        1: "section",
        2: "section",
        3: "subsection",
        4: "subsubsection",
        5: "paragraph",
        6: "subparagraph",
    }

    def _heading(match):
        command = commands[len(match.group(1))]
        return f"\\{command}{{{match.group(2)}}}"

    # [ \t]+ rather than \s+: \s also matches newlines, which would let an
    # empty heading line swallow the following paragraph as its title.
    body = re.sub(
        r'^(#{1,6})[ \t]+(.+)$',
        _heading,
        report_md or "",
        flags=re.MULTILINE,
    )
    return body.replace('**', '')
|
|
|
|
def generate_bibtex_from_docs(docs: List[Dict[str, Any]]) -> str:
    """Generate BibTeX entries from pipeline documents, preserving original GRADE evidence.

    Each document becomes an ``@article`` entry, or ``@mastersthesis`` when
    its type/title looks like a thesis, its source does not look like a
    journal, and it has no DOI. The GRADE evidence label is written to the
    ``note`` field.

    Args:
        docs: Document mappings. ``None`` or an empty list yields an empty
            string.

    Returns:
        All entries joined by blank lines.
    """
    thesis_keywords = (
        "tesis", "thesis", "dissertation", "grado", "maestria", "doctorado", "licenciatura",
        "bachelor", "master", "phd",
    )
    journal_keywords = (
        "journal", "revista", "review", "proceedings", "conference", "transactions",
    )

    entries = []
    seen = set()

    for idx, doc in enumerate(docs or [], 1):
        # "metadata" may be present but None (or not a mapping); .get's default
        # only applies when the key is absent, so normalise it explicitly.
        metadata = doc.get("metadata")
        if not isinstance(metadata, dict):
            metadata = {}

        title = doc.get("title") or "Untitled"
        raw_id = doc.get("id") or doc.get("doi") or title
        base_key = _sanitize_key(str(raw_id), f"ref{idx}")
        cite_key = base_key
        if cite_key in seen:
            cite_key = f"{base_key}_{idx}"
        # The "_{idx}" form can itself clash with an earlier literal key
        # (e.g. ids "a", "a_3", "a"); keep bumping until the key is unique.
        bump = 2
        while cite_key in seen:
            cite_key = f"{base_key}_{idx}_{bump}"
            bump += 1
        seen.add(cite_key)

        authors = _doc_authors(doc)
        year = doc.get("year") or "n.d."
        doi = doc.get("doi") or metadata.get("doi") or ""
        source = doc.get("source") or metadata.get("journal") or "Repository"
        url = doc.get("url") or doc.get("pdfUrl") or doc.get("handleUrl") or ""
        evidence = (
            doc.get("evidenceLevel")
            or doc.get("grade_label")
            or doc.get("grade_level")
            or "PENDIENTE"
        )

        type_text = str(doc.get("type") or "").lower()
        title_text = str(title).lower()
        source_text = str(source).lower()
        type_and_title = f"{type_text} {title_text}"
        is_thesis = any(keyword in type_and_title for keyword in thesis_keywords)
        has_journal_hint = any(keyword in source_text for keyword in journal_keywords)

        bib_type = "mastersthesis" if is_thesis and not has_journal_hint and not doi else "article"
        venue_field = "school" if bib_type == "mastersthesis" else "journal"

        # The URL is written verbatim (not escaped), as in the original.
        url_field = f" url = {{{url}}},\n" if url else ""
        entry = (
            f"@{bib_type}{{{cite_key},\n"
            f" author = {{{_escape_bibtex(authors)}}},\n"
            f" title = {{{_escape_bibtex(title)}}},\n"
            f" {venue_field} = {{{_escape_bibtex(source)}}},\n"
            f" year = {{{_escape_bibtex(year)}}},\n"
            f"{url_field}"
            f" doi = {{{_escape_bibtex(doi)}}},\n"
            f" note = {{Calidad de evidencia GRADE: {_escape_bibtex(evidence)}}}\n"
            f"}}"
        )
        entries.append(entry)

    return "\n\n".join(entries)
|
|
|
|
def persist_research_output(
    report_md: str,
    docs: List[Dict[str, Any]],
    query: str,
    agent_role: str = "general",
    model: str = "unknown",
    output_root: Optional[str] = None,
) -> Dict[str, str]:
    """Persist final pipeline artifacts following the original beta data-mining layout.

    Writes five files under the output root: ``<role>.tex``, ``<role>.md``,
    ``referencias.bib``, ``data/json1_scraping/scraping_data.json`` and
    ``data/json2_outputs/llm_outputs.json``. Existing files are overwritten.

    Args:
        report_md: Final report in Markdown; a falsy value is written as empty.
        docs: Source documents; ``None`` is treated as an empty list.
        query: Query recorded in both JSON files.
        agent_role: Used (sanitized, lower-cased) as the .tex/.md base name.
        model: Model name recorded with the output; falsy becomes "unknown".
        output_root: Target directory; defaults to ``latex_output`` under the
            project root.

    Returns:
        Mapping of artifact name (``tex``, ``markdown``, ``bib``,
        ``scraping_json``, ``outputs_json``) to the written file path.
    """
    # Function-scope import: the module itself only imports `datetime`.
    from datetime import timezone

    docs = docs or []

    root = output_root or os.path.join(_project_root(), "latex_output")
    scraping_dir = os.path.join(root, "data", "json1_scraping")
    outputs_dir = os.path.join(root, "data", "json2_outputs")
    # Creating the nested directories also creates `root` itself.
    os.makedirs(scraping_dir, exist_ok=True)
    os.makedirs(outputs_dir, exist_ok=True)

    # Read the clock once, timezone-aware. A naive utcnow() value is treated
    # as local time by .timestamp(), which skews the epoch id off-UTC.
    now = datetime.now(timezone.utc)
    # Same textual format as before: naive ISO-8601 with a trailing "Z".
    timestamp = now.replace(tzinfo=None).isoformat() + "Z"

    role_name = _sanitize_key(
        (agent_role or "consolidado_investigacion").lower(),
        "consolidado_investigacion",
    )
    tex_path = os.path.join(root, f"{role_name}.tex")
    md_path = os.path.join(root, f"{role_name}.md")
    bib_path = os.path.join(root, "referencias.bib")
    scraping_path = os.path.join(scraping_dir, "scraping_data.json")
    outputs_path = os.path.join(outputs_dir, "llm_outputs.json")

    bib = generate_bibtex_from_docs(docs)
    tex = _markdown_to_latex_body(report_md)

    for target, text in ((tex_path, tex), (md_path, report_md or ""), (bib_path, bib)):
        with open(target, "w", encoding="utf-8") as f:
            f.write(text)

    def _year_or_none(doc):
        # isdecimal() matches exactly what int() can parse; isdigit() also
        # accepts characters such as superscript digits that int() rejects.
        text = str(doc.get("year", ""))
        return int(text) if text.isdecimal() else None

    scraping_data = {
        "version": "1.0.0",
        "createdAt": timestamp,
        "lastModifiedAt": timestamp,
        "projectId": "LETXIPU-GRADIO",
        "totalRecords": len(docs),
        "records": [
            {
                "id": doc.get("id") or f"doc_{i}",
                "url": doc.get("url") or doc.get("pdfUrl") or doc.get("handleUrl") or "",
                "title": doc.get("title") or "Sin titulo",
                "snippet": doc.get("snippet") or doc.get("abstract") or "",
                "source": doc.get("source") or "Desconocido",
                "scrapedAt": timestamp,
                "metadata": {
                    "authors": doc.get("authors") or [],
                    "year": _year_or_none(doc),
                    "abstract": doc.get("abstract"),
                    "doi": doc.get("doi"),
                    "pdfUrl": doc.get("pdfUrl"),
                    "university": doc.get("university") or doc.get("institution"),
                    "queries": [query],
                    "evidenceLevel": (
                        doc.get("evidenceLevel")
                        or doc.get("grade_label")
                        or doc.get("grade_level")
                    ),
                },
            }
            for i, doc in enumerate(docs, 1)
        ],
        "changelog": [
            {
                "timestamp": timestamp,
                "action": "added",
                "recordCount": len(docs),
                "description": "Generado automaticamente por el pipeline Python Gradio.",
            }
        ],
        "metadata": {
            "queryUsed": query,
            "sourcesEnabled": [],
            "iterationsCompleted": 1,
            "totalIterationsPlanned": 1,
        },
    }

    output_record = {
        "id": f"out_{int(now.timestamp())}",
        "timestamp": timestamp,
        "promptUsed": query,
        "modelUsed": model or "unknown",
        "agentRole": agent_role,
        "inputRecordCount": len(docs),
        "output": {"plainText": report_md or "", "latex": tex},
        "sourceScrapingVersion": "1.0.0",
    }
    outputs_data = {
        "version": "1.0.0",
        "createdAt": timestamp,
        "lastModifiedAt": timestamp,
        "projectId": "LETXIPU-GRADIO",
        "outputs": [output_record],
    }

    with open(scraping_path, "w", encoding="utf-8") as f:
        json.dump(scraping_data, f, ensure_ascii=False, indent=2)
    with open(outputs_path, "w", encoding="utf-8") as f:
        json.dump(outputs_data, f, ensure_ascii=False, indent=2)

    return {
        "tex": tex_path,
        "markdown": md_path,
        "bib": bib_path,
        "scraping_json": scraping_path,
        "outputs_json": outputs_path,
    }
|
|
|
|
def export_markdown(report_md: str, query: str = "") -> str:
    """Export the report as a Markdown file with YAML front matter.

    The file is written to the system temp directory and named
    ``<sanitized query>_<YYYYmmdd_HHMMSS>.md`` (``research`` when the query
    yields no usable characters).

    Args:
        report_md: Report body in Markdown; ``None`` is treated as empty.
        query: Research query, used for the title and the file name;
            ``None`` is treated as empty.

    Returns:
        Path of the written file.
    """
    query = query or ""
    now = datetime.now()
    stamp = now.strftime("%Y%m%d_%H%M%S")

    cleaned = re.sub(r'[^\w\s-]', '', query[:40]).strip()
    # Replace every whitespace character, not only spaces, so tabs and
    # newlines in the query cannot end up in the file name.
    safe_name = re.sub(r'\s', '_', cleaned) or "research"
    path = os.path.join(tempfile.gettempdir(), f"{safe_name}_{stamp}.md")

    # A JSON string literal is also a valid YAML double-quoted scalar, so
    # quotes, backslashes and newlines in the query cannot break the header.
    title = json.dumps(query, ensure_ascii=False)
    header = (
        "---\n"
        f"title: {title}\n"
        f"date: \"{now.isoformat()}\"\n"
        "generator: \"LETXIPU Research Platform\"\n"
        "---\n"
        "\n"
    )
    with open(path, 'w', encoding='utf-8') as f:
        f.write(header + (report_md or ""))

    return path
|
|
|
|
def export_bibtex(docs_df: pd.DataFrame, query: str = "") -> str:
    """Export documents as BibTeX references.

    Reads the columns ``Título``, ``Autores``, ``Año``, ``DOI`` and
    ``Fuente`` from each row and writes one ``@article`` entry per row to
    ``<sanitized query>_<YYYYmmdd_HHMMSS>.bib`` in the system temp directory.
    Field values are written verbatim (no BibTeX escaping).

    Args:
        docs_df: Documents table; ``None`` produces an empty file.
        query: Used only for the file name; ``None`` is treated as empty.

    Returns:
        Path of the written file.
    """
    query = query or ""
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    cleaned = re.sub(r'[^\w\s-]', '', query[:40]).strip()
    # Replace every whitespace character so tabs/newlines stay out of the name.
    safe_name = re.sub(r'\s', '_', cleaned) or "references"
    path = os.path.join(tempfile.gettempdir(), f"{safe_name}_{stamp}.bib")

    def _cell(row, column, default):
        """Return the cell as text, mapping missing values to *default*."""
        value = row.get(column, default)
        # DataFrames represent missing cells as NaN/None/NA; these must not
        # leak into the output as the literal text "nan".
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return default
        # A numeric column containing NaN is upcast to float: 2020 -> 2020.0.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    entries = []
    seen = set()
    rows = docs_df.iterrows() if docs_df is not None else ()
    for idx, row in rows:
        title = _cell(row, "Título", "N/A")
        authors = _cell(row, "Autores", "N/A")
        year = _cell(row, "Año", "")
        doi = _cell(row, "DOI", "")
        source = _cell(row, "Fuente", "")

        # Last word of the first comma-separated author; tolerate empty or
        # whitespace-only author strings instead of raising IndexError.
        lead_words = authors.split(",")[0].split()
        first_author = lead_words[-1] if lead_words else "unknown"
        base_key = re.sub(r'[^a-zA-Z0-9]', '', f"{first_author}{year}") or f"ref{idx}"

        # BibTeX rejects repeated keys; disambiguate same author+year pairs.
        cite_key = base_key
        suffix = 2
        while cite_key in seen:
            cite_key = f"{base_key}_{suffix}"
            suffix += 1
        seen.add(cite_key)

        entries.append(
            f"@article{{{cite_key},\n"
            f"title = {{{title}}},\n"
            f"author = {{{authors}}},\n"
            f"year = {{{year}}},\n"
            f"doi = {{{doi}}},\n"
            f"journal = {{{source}}},\n"
            f"}}"
        )

    with open(path, 'w', encoding='utf-8') as f:
        f.write("\n\n".join(entries))

    return path
|
|
|
|
def export_zip(report_md: str, docs_df: pd.DataFrame, query: str = "",
               settings: Optional[dict] = None) -> str:
    """Export the full workspace as a ZIP archive in the system temp directory.

    The archive contains ``report.md`` (with YAML front matter),
    ``references.bib``, ``documents.csv``, ``documents.json`` and
    ``metadata.json`` (which embeds *settings*).

    Args:
        report_md: Report body in Markdown; ``None`` is treated as empty.
        docs_df: Documents table written as CSV, JSON and BibTeX.
        query: Research query, used for titles and the file name; ``None``
            is treated as empty.
        settings: Optional settings stored in ``metadata.json``.

    Returns:
        Path of the written ZIP file.
    """
    query = query or ""
    now = datetime.now()
    stamp = now.strftime("%Y%m%d_%H%M%S")
    cleaned = re.sub(r'[^\w\s-]', '', query[:40]).strip()
    # Replace every whitespace character so tabs/newlines stay out of the name.
    safe_name = re.sub(r'\s', '_', cleaned) or "research"
    path = os.path.join(tempfile.gettempdir(), f"{safe_name}_workspace_{stamp}.zip")

    # A JSON string literal is a valid YAML double-quoted scalar, so quotes
    # or newlines in the query cannot corrupt the front matter.
    title = json.dumps(query, ensure_ascii=False)
    header = f"---\ntitle: {title}\ndate: \"{now.isoformat()}\"\n---\n\n"

    with zipfile.ZipFile(path, 'w', zipfile.ZIP_DEFLATED) as zf:
        zf.writestr("report.md", header + (report_md or ""))

        # export_bibtex writes its own temp file, which is copied in here.
        # It is intentionally not deleted afterwards: its name is derived
        # from the query and the current second, so a caller that exported
        # the same references moments earlier may hold the same path.
        bib_path = export_bibtex(docs_df, query)
        zf.write(bib_path, "references.bib")

        csv_content = docs_df.to_csv(index=False, encoding='utf-8')
        zf.writestr("documents.csv", csv_content)

        docs_json = docs_df.to_json(orient='records', force_ascii=False, indent=2)
        zf.writestr("documents.json", docs_json)

        meta = {
            "query": query,
            "timestamp": now.isoformat(),
            "total_documents": len(docs_df),
            "platform": "LETXIPU Research Platform",
            "settings": settings or {},
        }
        zf.writestr("metadata.json", json.dumps(meta, indent=2, ensure_ascii=False))

    return path
|
|
|
|
def export_docx(report_md: str, query: str = "") -> Optional[str]:
    """Export the report as DOCX using python-docx if available.

    Handles a small Markdown subset line by line: ``#``-style headings
    (levels 1-6), ``-``/``*`` bullets, ``1.``-style numbered items, ``>``
    quotes and plain paragraphs. ``**bold**`` and ``*italic*`` spans are
    rendered as formatted runs everywhere except in headings.

    Args:
        report_md: Report body in Markdown; ``None`` is treated as empty.
        query: Used as the document title and for the file name; ``None``
            is treated as empty.

    Returns:
        Path of the written file in the system temp directory, or ``None``
        when python-docx is not installed.
    """
    try:
        from docx import Document
        from docx.enum.text import WD_ALIGN_PARAGRAPH
    except ImportError:
        return None

    query = query or ""
    now = datetime.now()
    stamp = now.strftime("%Y%m%d_%H%M%S")
    cleaned = re.sub(r'[^\w\s-]', '', query[:40]).strip()
    # Replace every whitespace character so tabs/newlines stay out of the name.
    safe_name = re.sub(r'\s', '_', cleaned) or "research"
    path = os.path.join(tempfile.gettempdir(), f"{safe_name}_{stamp}.docx")

    emphasis = re.compile(r'(\*\*.*?\*\*|\*.*?\*)')
    heading = re.compile(r'(#{1,6}) (.*)')
    numbered = re.compile(r'^\d+\.\s')

    def _add_inline_runs(paragraph, text):
        """Append *text* to *paragraph*, turning **bold**/*italic* into runs."""
        for part in emphasis.split(text):
            if not part:
                continue  # re.split yields empty strings around matches
            if part.startswith('**') and part.endswith('**'):
                paragraph.add_run(part[2:-2]).bold = True
            elif part.startswith('*') and part.endswith('*'):
                paragraph.add_run(part[1:-1]).italic = True
            else:
                paragraph.add_run(part)

    doc = Document()

    title_para = doc.add_heading(query or "Informe de Investigación", level=0)
    title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    doc.add_paragraph(
        f"Generado: {now.strftime('%d/%m/%Y %H:%M')} | LETXIPU Research Platform",
        style='Subtitle'
    )
    doc.add_paragraph("")

    for line in (report_md or "").split('\n'):
        stripped = line.strip()
        if not stripped:
            doc.add_paragraph("")
            continue

        heading_match = heading.match(stripped)
        if heading_match:
            # The number of leading '#' characters is the heading level.
            doc.add_heading(heading_match.group(2), level=len(heading_match.group(1)))
        elif stripped.startswith(('- ', '* ')):
            _add_inline_runs(doc.add_paragraph(style='List Bullet'), stripped[2:])
        elif numbered.match(stripped):
            _add_inline_runs(
                doc.add_paragraph(style='List Number'),
                numbered.sub('', stripped),
            )
        elif stripped.startswith('> '):
            _add_inline_runs(doc.add_paragraph(style='Intense Quote'), stripped[2:])
        else:
            _add_inline_runs(doc.add_paragraph(), stripped)

    doc.save(path)
    return path
|
|