""" Export Module - Generate PDF, DOCX, TXT, and HTML downloads from AI responses. Premium formatting with professional layouts and smart content detection. """ import io import re from datetime import datetime from typing import List, Dict, Optional # ======================== TEXT CLEANING ======================== def clean_markdown(text: str) -> str: """Remove markdown formatting for clean document export.""" # Remove bold/italic markers text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) text = re.sub(r'\*(.+?)\*', r'\1', text) text = re.sub(r'__(.+?)__', r'\1', text) text = re.sub(r'_(.+?)_', r'\1', text) # Remove headers markers but keep text text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE) # Remove bullet markers text = re.sub(r'^[\-\*]\s+', '- ', text, flags=re.MULTILINE) # Remove numbered lists prefix (keep number) text = re.sub(r'^(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE) # Remove code blocks markers text = re.sub(r'```[\w]*\n?', '', text) # Remove inline code text = re.sub(r'`(.+?)`', r'\1', text) # Remove links but keep text text = re.sub(r'\[(.+?)\]\(.+?\)', r'\1', text) # Remove emojis (common ones) text = re.sub( r'[🎯✉️📈🧠💡🚀⭐📋🔍💼📊✅❌⚠️🎓🗺️📄🤖👋💬📎📷📚📭🎨🔥💪🏆🌟✨🎉💰📌🔑⚡🛠️🏁📐💎🥇🥈🥉]', '', text ) # Clean up extra whitespace text = re.sub(r'\n{3,}', '\n\n', text) return text.strip() def detect_content_type(text: str) -> str: """Detect the type of content for smart file naming.""" text_lower = text.lower() if any(w in text_lower for w in ['cover letter', 'carta de presentación', 'carta de motivación', 'estimado', 'dear']): return "cover_letter" if any(w in text_lower for w in ['match', 'compatibilidad', 'porcentaje', 'afinidad', '% de match']): return "job_match" if any(w in text_lower for w in ['skills gap', 'habilidades faltantes', 'roadmap', 'skill gap', 'brecha']): return "skills_analysis" if any(w in text_lower for w in ['resumen', 'perfil profesional', 'summary', 'strengths']): return "profile_summary" return "response" def get_smart_filename(text: str, extension: str) -> str: """Generate an intelligent filename based on content type.""" content_type = detect_content_type(text) timestamp = datetime.now().strftime("%Y%m%d_%H%M") type_names = { "cover_letter": "CoverLetter", "job_match": "JobMatch", "skills_analysis": "SkillsAnalysis", "profile_summary": "ProfileSummary", "response": "CareerAI", } name = type_names.get(content_type, "CareerAI") return f"{name}_{timestamp}.{extension}" def get_smart_title(text: str) -> str: """Generate a smart document title based on content.""" content_type = detect_content_type(text) titles = { "cover_letter": "Carta de Presentación", "job_match": "Análisis de Compatibilidad", "skills_analysis": "Análisis de Habilidades", "profile_summary": "Resumen de Perfil", "response": "Respuesta CareerAI", } return titles.get(content_type, "Respuesta CareerAI") # ======================== MARKDOWN PARSER ======================== def parse_markdown_blocks(text: str) -> list: """ Parse markdown into structured blocks for rich document rendering. Returns list of dicts: {type, content, level} """ blocks = [] lines = text.split('\n') i = 0 while i < len(lines): line = lines[i] # Headers header_match = re.match(r'^(#{1,6})\s+(.+)', line) if header_match: level = len(header_match.group(1)) blocks.append({ 'type': 'header', 'content': header_match.group(2).strip(), 'level': level }) i += 1 continue # Horizontal rules if re.match(r'^[\-\*\_]{3,}\s*$', line): blocks.append({'type': 'hr', 'content': '', 'level': 0}) i += 1 continue # Bullet lists bullet_match = re.match(r'^[\-\*]\s+(.+)', line) if bullet_match: items = [bullet_match.group(1).strip()] i += 1 while i < len(lines): next_bullet = re.match(r'^[\-\*]\s+(.+)', lines[i]) if next_bullet: items.append(next_bullet.group(1).strip()) i += 1 else: break blocks.append({'type': 'bullet_list', 'content': items, 'level': 0}) continue # Numbered lists num_match = re.match(r'^(\d+)\.\s+(.+)', line) if num_match: items = [num_match.group(2).strip()] i += 1 while i < len(lines): next_num = re.match(r'^\d+\.\s+(.+)', lines[i]) if next_num: items.append(next_num.group(1).strip()) i += 1 else: break blocks.append({'type': 'numbered_list', 'content': items, 'level': 0}) continue # Code blocks if line.strip().startswith('```'): lang = line.strip()[3:] code_lines = [] i += 1 while i < len(lines) and not lines[i].strip().startswith('```'): code_lines.append(lines[i]) i += 1 if i < len(lines): i += 1 # skip closing ``` blocks.append({ 'type': 'code', 'content': '\n'.join(code_lines), 'level': 0, 'lang': lang }) continue # Bold/emphasis lines (like "**Sección:**") bold_match = re.match(r'^\*\*(.+?)\*\*:?\s*$', line.strip()) if bold_match: blocks.append({ 'type': 'bold_heading', 'content': bold_match.group(1).strip(), 'level': 0 }) i += 1 continue # Empty lines if not line.strip(): i += 1 continue # Regular paragraph (collect consecutive lines) para_lines = [line] i += 1 while i < len(lines) and lines[i].strip() and not re.match(r'^#{1,6}\s+', lines[i]) \ and not re.match(r'^[\-\*]\s+', lines[i]) and not re.match(r'^\d+\.\s+', lines[i]) \ and not lines[i].strip().startswith('```') and not re.match(r'^\*\*(.+?)\*\*:?\s*$', lines[i].strip()): para_lines.append(lines[i]) i += 1 blocks.append({ 'type': 'paragraph', 'content': ' '.join(para_lines), 'level': 0 }) return blocks def strip_inline_md(text: str) -> str: """Remove inline markdown (bold, italic, code, links) from text.""" text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) text = re.sub(r'\*(.+?)\*', r'\1', text) text = re.sub(r'__(.+?)__', r'\1', text) text = re.sub(r'_(.+?)_', r'\1', text) text = re.sub(r'`(.+?)`', r'\1', text) text = re.sub(r'\[(.+?)\]\(.+?\)', r'\1', text) return text def _sanitize_for_pdf(text: str) -> str: """Replace Unicode characters with ASCII equivalents for PDF Helvetica font.""" replacements = { '\u2022': '-', '\u2013': '-', '\u2014': '--', '\u2018': "'", '\u2019': "'", '\u201c': '"', '\u201d': '"', '\u2026': '...', '\u2192': '->', '\u2190': '<-', '\u00b7': '-', '\u2500': '-', '\u2501': '-', '\u25cf': '-', '\u2605': '*', '\u2713': 'v', '\u2717': 'x', } for char, replacement in replacements.items(): text = text.replace(char, replacement) text = re.sub( r'[\U0001F300-\U0001F9FF\U00002702-\U000027B0\U0000FE00-\U0000FE0F\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF]', '', text ) return text # ======================== PDF EXPORT ======================== def export_to_pdf(text: str, title: Optional[str] = None) -> bytes: """Export text content to a premium-styled PDF.""" try: from fpdf import FPDF except ImportError: raise ValueError("Instala fpdf2: pip install fpdf2") if title is None: title = get_smart_title(text) pdf = FPDF() pdf.set_auto_page_break(auto=True, margin=25) pdf.add_page() page_width = pdf.w - 40 # margins # ---- Header Band ---- pdf.set_fill_color(88, 60, 200) pdf.rect(0, 0, 210, 3, 'F') # ---- Title ---- pdf.set_y(15) pdf.set_font("Helvetica", "B", 20) pdf.set_text_color(88, 60, 200) pdf.cell(0, 12, title, new_x="LMARGIN", new_y="NEXT", align="C") pdf.ln(1) # ---- Subtitle / Date ---- pdf.set_font("Helvetica", "", 9) pdf.set_text_color(140, 140, 150) date_str = datetime.now().strftime("%d de %B, %Y | %H:%M") pdf.cell(0, 6, f"Generado el {date_str}", new_x="LMARGIN", new_y="NEXT", align="C") pdf.ln(2) # ---- Divider ---- y = pdf.get_y() pdf.set_draw_color(200, 200, 215) pdf.set_line_width(0.3) # Gradient-like effect with multiple lines pdf.set_draw_color(88, 60, 200) pdf.line(70, y, 140, y) pdf.set_draw_color(200, 200, 215) pdf.line(40, y + 0.5, 170, y + 0.5) pdf.ln(10) # ---- Content Blocks ---- blocks = parse_markdown_blocks(text) for block in blocks: btype = block['type'] if btype == 'header': level = block['level'] content = _sanitize_for_pdf(strip_inline_md(block['content'])) pdf.ln(4) if level == 1: pdf.set_font("Helvetica", "B", 16) pdf.set_text_color(30, 30, 45) elif level == 2: pdf.set_font("Helvetica", "B", 14) pdf.set_text_color(88, 60, 200) elif level == 3: pdf.set_font("Helvetica", "B", 12) pdf.set_text_color(60, 60, 80) else: pdf.set_font("Helvetica", "B", 11) pdf.set_text_color(80, 80, 100) pdf.multi_cell(0, 7, content) pdf.ln(2) elif btype == 'bold_heading': content = _sanitize_for_pdf(strip_inline_md(block['content'])) pdf.ln(3) pdf.set_font("Helvetica", "B", 12) pdf.set_text_color(88, 60, 200) pdf.multi_cell(0, 7, content) pdf.set_text_color(30, 30, 45) pdf.ln(1) elif btype == 'paragraph': content = _sanitize_for_pdf(strip_inline_md(block['content'])) pdf.set_font("Helvetica", "", 10.5) pdf.set_text_color(40, 40, 50) pdf.multi_cell(0, 5.5, content) pdf.ln(3) elif btype == 'bullet_list': for item in block['content']: item_clean = _sanitize_for_pdf(strip_inline_md(item)) pdf.set_font("Helvetica", "", 10.5) pdf.set_text_color(88, 60, 200) pdf.cell(8, 5.5, "-") pdf.set_text_color(40, 40, 50) pdf.multi_cell(0, 5.5, f" {item_clean}") pdf.ln(1) pdf.ln(2) elif btype == 'numbered_list': for idx, item in enumerate(block['content'], 1): item_clean = _sanitize_for_pdf(strip_inline_md(item)) pdf.set_font("Helvetica", "B", 10.5) pdf.set_text_color(88, 60, 200) pdf.cell(10, 5.5, f"{idx}.") pdf.set_font("Helvetica", "", 10.5) pdf.set_text_color(40, 40, 50) pdf.multi_cell(0, 5.5, f" {item_clean}") pdf.ln(1) pdf.ln(2) elif btype == 'code': pdf.ln(2) # Code block background pdf.set_fill_color(245, 245, 248) pdf.set_font("Courier", "", 9) pdf.set_text_color(60, 60, 80) code_lines = block['content'].split('\n') for cl in code_lines: pdf.cell(0, 5, f" {cl}", new_x="LMARGIN", new_y="NEXT", fill=True) pdf.ln(3) elif btype == 'hr': pdf.ln(3) y = pdf.get_y() pdf.set_draw_color(200, 200, 215) pdf.line(20, y, 190, y) pdf.ln(5) # ---- Footer ---- pdf.ln(10) y = pdf.get_y() pdf.set_draw_color(88, 60, 200) pdf.line(60, y, 150, y) pdf.ln(6) pdf.set_font("Helvetica", "I", 8) pdf.set_text_color(160, 160, 175) pdf.cell(0, 5, "Generado por CareerAI - Asistente de Carrera con IA", align="C") pdf.ln(4) pdf.set_font("Helvetica", "", 7) pdf.cell(0, 4, "Powered by RAG + Llama 3.3 + ChromaDB", align="C") # Bottom band pdf.set_fill_color(88, 60, 200) pdf.rect(0, 294, 210, 3, 'F') return pdf.output() # ======================== DOCX EXPORT ======================== def export_to_docx(text: str, title: Optional[str] = None) -> bytes: """Export text content to a professionally styled DOCX.""" try: from docx import Document from docx.shared import Pt, RGBColor, Inches, Cm from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.oxml.ns import qn from docx.oxml import OxmlElement except ImportError: raise ValueError("Instala python-docx: pip install python-docx") if title is None: title = get_smart_title(text) doc = Document() # ---- Page margins ---- for section in doc.sections: section.top_margin = Cm(2) section.bottom_margin = Cm(2) section.left_margin = Cm(2.5) section.right_margin = Cm(2.5) # ---- Default font ---- style = doc.styles['Normal'] font = style.font font.name = 'Calibri' font.size = Pt(11) font.color.rgb = RGBColor(40, 40, 50) # ---- Accent line ---- accent_para = doc.add_paragraph() accent_para.alignment = WD_ALIGN_PARAGRAPH.CENTER accent_run = accent_para.add_run("━" * 40) accent_run.font.color.rgb = RGBColor(88, 60, 200) accent_run.font.size = Pt(6) # ---- Title ---- title_para = doc.add_paragraph() title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER title_para.space_after = Pt(4) title_run = title_para.add_run(title) title_run.font.size = Pt(22) title_run.font.bold = True title_run.font.color.rgb = RGBColor(88, 60, 200) title_run.font.name = 'Calibri Light' # ---- Date ---- date_para = doc.add_paragraph() date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER date_para.space_after = Pt(2) date_str = datetime.now().strftime("%d de %B, %Y • %H:%M") date_run = date_para.add_run(f"Generado el {date_str}") date_run.font.size = Pt(9) date_run.font.color.rgb = RGBColor(140, 140, 150) # ---- Divider ---- div_para = doc.add_paragraph() div_para.alignment = WD_ALIGN_PARAGRAPH.CENTER div_para.space_after = Pt(12) div_run = div_para.add_run("─" * 50) div_run.font.color.rgb = RGBColor(200, 200, 215) div_run.font.size = Pt(8) # ---- Content Blocks ---- blocks = parse_markdown_blocks(text) for block in blocks: btype = block['type'] if btype == 'header': level = block['level'] content = strip_inline_md(block['content']) p = doc.add_paragraph() p.space_before = Pt(12) p.space_after = Pt(4) run = p.add_run(content) run.font.bold = True if level == 1: run.font.size = Pt(18) run.font.color.rgb = RGBColor(30, 30, 45) elif level == 2: run.font.size = Pt(15) run.font.color.rgb = RGBColor(88, 60, 200) elif level == 3: run.font.size = Pt(13) run.font.color.rgb = RGBColor(60, 60, 80) else: run.font.size = Pt(12) run.font.color.rgb = RGBColor(80, 80, 100) elif btype == 'bold_heading': content = strip_inline_md(block['content']) p = doc.add_paragraph() p.space_before = Pt(8) p.space_after = Pt(2) run = p.add_run(content) run.font.bold = True run.font.size = Pt(12) run.font.color.rgb = RGBColor(88, 60, 200) elif btype == 'paragraph': content = strip_inline_md(block['content']) p = doc.add_paragraph(content) p.paragraph_format.line_spacing = Pt(16) p.space_after = Pt(6) elif btype == 'bullet_list': for item in block['content']: item_clean = strip_inline_md(item) p = doc.add_paragraph(item_clean, style='List Bullet') p.paragraph_format.line_spacing = Pt(15) elif btype == 'numbered_list': for item in block['content']: item_clean = strip_inline_md(item) p = doc.add_paragraph(item_clean, style='List Number') p.paragraph_format.line_spacing = Pt(15) elif btype == 'code': code_para = doc.add_paragraph() code_para.space_before = Pt(6) code_para.space_after = Pt(6) # Add shading to code block shading = OxmlElement('w:shd') shading.set(qn('w:fill'), 'F5F5F8') shading.set(qn('w:val'), 'clear') code_para.paragraph_format.element.get_or_add_pPr().append(shading) run = code_para.add_run(block['content']) run.font.name = 'Consolas' run.font.size = Pt(9) run.font.color.rgb = RGBColor(60, 60, 80) elif btype == 'hr': hr_para = doc.add_paragraph() hr_para.alignment = WD_ALIGN_PARAGRAPH.CENTER hr_run = hr_para.add_run("─" * 50) hr_run.font.color.rgb = RGBColor(200, 200, 215) hr_run.font.size = Pt(8) # ---- Footer ---- div_para2 = doc.add_paragraph() div_para2.alignment = WD_ALIGN_PARAGRAPH.CENTER div_para2.space_before = Pt(20) div_run2 = div_para2.add_run("─" * 50) div_run2.font.color.rgb = RGBColor(200, 200, 215) div_run2.font.size = Pt(8) footer_para = doc.add_paragraph() footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER footer_run = footer_para.add_run( "Generado por CareerAI — Asistente de Carrera con IA" ) footer_run.font.size = Pt(8) footer_run.font.italic = True footer_run.font.color.rgb = RGBColor(160, 160, 175) sub_para = doc.add_paragraph() sub_para.alignment = WD_ALIGN_PARAGRAPH.CENTER sub_run = sub_para.add_run("Powered by RAG + Llama 3.3 + ChromaDB") sub_run.font.size = Pt(7) sub_run.font.color.rgb = RGBColor(180, 180, 195) # ---- Save to bytes ---- buffer = io.BytesIO() doc.save(buffer) buffer.seek(0) return buffer.getvalue() # ======================== TXT EXPORT ======================== def export_to_txt(text: str) -> bytes: """Export text content as a clean, well-formatted TXT file.""" clean = clean_markdown(text) title = get_smart_title(text) date_str = datetime.now().strftime('%d/%m/%Y %H:%M') header = ( f"{'=' * 60}\n" f" {title}\n" f" Generado: {date_str}\n" f" CareerAI — Asistente de Carrera con IA\n" f"{'=' * 60}\n\n" ) footer = ( f"\n\n{'─' * 60}\n" f"Generado por CareerAI | Powered by RAG + Llama 3.3\n" ) return (header + clean + footer).encode("utf-8") # ======================== HTML EXPORT ======================== def export_to_html(text: str, title: Optional[str] = None) -> bytes: """Export text content as a beautifully styled standalone HTML file.""" import html as html_lib if title is None: title = get_smart_title(text) date_str = datetime.now().strftime("%d de %B, %Y • %H:%M") # Convert markdown to HTML-like content blocks = parse_markdown_blocks(text) content_html = "" for block in blocks: btype = block['type'] if btype == 'header': level = block['level'] content = html_lib.escape(strip_inline_md(block['content'])) tag = f"h{min(level + 1, 6)}" # shift down since h1 is title content_html += f"<{tag}>{content}{tag}>\n" elif btype == 'bold_heading': content = html_lib.escape(strip_inline_md(block['content'])) content_html += f'
{content}
\n" elif btype == 'bullet_list': content_html += "{code_content}\n'
elif btype == 'hr':
content_html += '