Cybersecurity-Panel

Running

App Files Files Community

Girish Jeswani committed on Jul 18, 2025

Commit

b206ba3

1 Parent(s): a3ae13f

fix pdf formatting

Browse files

Files changed (2) hide show

multi_llm_chatbot_backend/app/utils/chat_summary.py +102 -15
multi_llm_chatbot_backend/app/utils/file_export.py +128 -46

multi_llm_chatbot_backend/app/utils/chat_summary.py CHANGED Viewed

@@ -15,7 +15,21 @@ async def generate_summary_from_messages(messages: List[dict], llm: LLMClient, m
         system_prompt = (
             "You are an academic assistant. Summarize the following PhD chat conversation "
-            "into concise bullet points (max 10) or short paragraphs. Focus on insights, questions, and advice."
         )
         context = [{"role": "user", "content": f"Chat Log:\n{full_text}"}]
@@ -27,17 +41,49 @@ async def generate_summary_from_messages(messages: List[dict], llm: LLMClient, m
             max_tokens=max_tokens
         )
-        return summary.strip()
     except Exception as e:
         logger.error(f"Error generating summary: {str(e)}")
         return "Summary generation failed. Please try again later."
-def parse_summary_to_blocks(summary_text: str) -> List[Dict]:
-    #summary_text = re.sub(r'(?<!\n)([*•] )', r'\n\1', summary_text)
-    #summary_text = re.sub(r'(?<!\n)(\d+\.\s+)', r'\n\1', summary_text)
-    #summary_text = re.sub(r'(?<=[.!?])(?=\S)', ' ', summary_text)
     lines = summary_text.strip().splitlines()
     blocks = []
     current_block = None
@@ -51,7 +97,7 @@ def parse_summary_to_blocks(summary_text: str) -> List[Dict]:
         if not line:
             continue
-        # Match section headings (e.g. **Title:**) as heading block
         heading_match = re.match(r'^\*\*(.+?)\*\*:?$', line)
         if heading_match:
             flush_current_block()
@@ -60,16 +106,17 @@ def parse_summary_to_blocks(summary_text: str) -> List[Dict]:
             current_block = None
             continue
-        # Match bullet list
-        if line.startswith("* "):
             if current_block is None or current_block["type"] != "list" or current_block.get("style") != "bullet":
                 flush_current_block()
                 current_block = {"type": "list", "style": "bullet", "items": []}
-            current_block["items"].append(line[2:].strip())
             continue
-        # Match numbered list
-        number_match = re.match(r'^\d+\.\s+(.*)', line)
         if number_match:
             if current_block is None or current_block["type"] != "list" or current_block.get("style") != "numbered":
                 flush_current_block()
@@ -85,8 +132,48 @@ def parse_summary_to_blocks(summary_text: str) -> List[Dict]:
     flush_current_block()
-    import pprint
-    print("[DEBUG] Summary Blocks:")
-    pprint.pprint(blocks)
     return blocks

         system_prompt = (
             "You are an academic assistant. Summarize the following PhD chat conversation "
+            "into a well-formatted summary with clear bullet points. "
+            "Please format your response as follows:\n"
+            "- Use bullet points (starting with *) for key insights\n"
+            "- Put each bullet point on a separate line\n"
+            "- Include section headings if appropriate (formatted as **Section Name:**)\n"
+            "- Focus on insights, questions, and actionable advice\n"
+            "- Maximum 10 bullet points\n\n"
+            "Example format:\n"
+            "**Key Insights:**\n"
+            "* First main point about the conversation\n"
+            "* Second important insight\n"
+            "* Third key takeaway\n\n"
+            "**Recommendations:**\n"
+            "* First actionable recommendation\n"
+            "* Second suggestion"
         )
         context = [{"role": "user", "content": f"Chat Log:\n{full_text}"}]
             max_tokens=max_tokens
         )
+        # Post-process the summary to ensure proper formatting
+        formatted_summary = _format_summary_text(summary.strip())
+        return formatted_summary
     except Exception as e:
         logger.error(f"Error generating summary: {str(e)}")
         return "Summary generation failed. Please try again later."
+def _format_summary_text(summary_text: str) -> str:
+    """
+    Post-process the summary text to ensure proper bullet point formatting.
+    """
+    # Fix common formatting issues
+    # Add line breaks before bullet points that don't have them
+    summary_text = re.sub(r'(?<!\n)([*•] )', r'\n\1', summary_text)
+    # Add line breaks before numbered lists that don't have them
+    summary_text = re.sub(r'(?<!\n)(\d+\.\s+)', r'\n\1', summary_text)
+    # Add line breaks after periods followed by capital letters (likely new sentences)
+    summary_text = re.sub(r'(?<=[.!?])(?=\s*[*•]\s)', '\n', summary_text)
+    # Clean up multiple consecutive newlines
+    summary_text = re.sub(r'\n{3,}', '\n\n', summary_text)
+    # Ensure bullet points are properly spaced
+    summary_text = re.sub(r'\n([*•] )', r'\n\n\1', summary_text)
+    # Fix section headings that might be run together
+    summary_text = re.sub(r'([.!?])\s*(\*\*[^*]+\*\*)', r'\1\n\n\2', summary_text)
+    return summary_text.strip()
+def parse_summary_to_blocks(summary_text: str) -> List[Dict]:
+    """
+    Parse summary text into structured blocks for better formatting.
+    """
+    # First, ensure proper formatting
+    summary_text = _format_summary_text(summary_text)
     lines = summary_text.strip().splitlines()
     blocks = []
     current_block = None
         if not line:
             continue
+        # Match section headings (e.g. **Title:** or **Title**)
         heading_match = re.match(r'^\*\*(.+?)\*\*:?$', line)
         if heading_match:
             flush_current_block()
             current_block = None
             continue
+        # Match bullet list items (*, •, or -)
+        bullet_match = re.match(r'^[*•-]\s+(.+)', line)
+        if bullet_match:
             if current_block is None or current_block["type"] != "list" or current_block.get("style") != "bullet":
                 flush_current_block()
                 current_block = {"type": "list", "style": "bullet", "items": []}
+            current_block["items"].append(bullet_match.group(1).strip())
             continue
+        # Match numbered list items
+        number_match = re.match(r'^\d+\.\s+(.+)', line)
         if number_match:
             if current_block is None or current_block["type"] != "list" or current_block.get("style") != "numbered":
                 flush_current_block()
     flush_current_block()
+    # Debug output to help troubleshoot
+    logger.info(f"[DEBUG] Parsed {len(blocks)} blocks from summary")
+    for i, block in enumerate(blocks):
+        if block["type"] == "list":
+            logger.info(f"Block {i}: {block['type']} ({block['style']}) with {len(block['items'])} items")
+        else:
+            logger.info(f"Block {i}: {block['type']}")
     return blocks
def format_summary_for_text_export(summary_text: str) -> str:
    """
    Lay out summary text for plain-text style exports (TXT / DOCX).

    The text is first normalized with ``_format_summary_text``; blank lines
    are then rebuilt from scratch so that:

    * every ``**Heading**`` / ``**Heading:**`` line is surrounded by exactly
      one blank line,
    * consecutive bullet lines (``*``, ``•`` or ``-``) stay together as one
      group, separated from other content by one blank line,
    * every other line becomes its own paragraph.

    Args:
        summary_text: Summary text as produced by the LLM or a prior format pass.

    Returns:
        The re-spaced text. It never starts or ends with a blank line and
        never contains two blank lines in a row.
    """
    heading_re = re.compile(r'^\*\*(.+?)\*\*:?$')
    bullet_re = re.compile(r'^[*•-]\s+')

    formatted_lines = []

    def add_gap() -> None:
        # Insert a single separating blank line, but never at the very top
        # and never directly after another blank line (a heading already
        # leaves one behind, which previously produced doubled gaps).
        if formatted_lines and formatted_lines[-1] != '':
            formatted_lines.append('')

    for raw_line in _format_summary_text(summary_text).split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # A bullet directly following another bullet continues the group.
        continues_group = (
            bullet_re.match(line)
            and formatted_lines
            and bullet_re.match(formatted_lines[-1])
        )
        if not continues_group:
            add_gap()
        formatted_lines.append(line)

        if heading_re.match(line):
            formatted_lines.append('')  # Space after heading

    # A heading on the last line would otherwise leave a dangling blank line.
    while formatted_lines and formatted_lines[-1] == '':
        formatted_lines.pop()

    return '\n'.join(formatted_lines)

multi_llm_chatbot_backend/app/utils/file_export.py CHANGED Viewed

@@ -7,6 +7,7 @@ from fastapi.responses import StreamingResponse
 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, ListFlowable, ListItem
 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
 from reportlab.lib.enums import TA_CENTER, TA_LEFT
 from io import BytesIO
 import re
@@ -36,63 +37,101 @@ def generate_docx_file(text: str) -> BytesIO:
     return buffer
 def generate_pdf_file(text: str) -> BytesIO:
     buffer = BytesIO()
-    doc = SimpleDocTemplate(buffer, pagesize=letter)
     styles = getSampleStyleSheet()
     story = []
-    for block in text.split("\n\n"):
-        story.append(Paragraph(block.strip(), styles["Normal"]))
         story.append(Spacer(1, 12))
     doc.build(story)
     buffer.seek(0)
     return buffer
-def export_chat_as_file(content: Union[str, List[dict]], format: str) -> Tuple[BytesIO, str, str]:
-    """
-    Export either a list of chat messages or a summary string to the specified format.
-    """
-    if isinstance(content, list):
-        text = format_messages_for_export(content)
-    elif isinstance(content, str):
-        text = content.strip()
-    else:
-        raise ValueError("Unsupported content type")
-    if format == "txt":
-        return generate_txt_file(text), "chat_export.txt", "text/plain"
-    elif format == "docx":
-        return generate_docx_file(text), "chat_export.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-    elif format == "pdf":
-        return generate_pdf_file(text), "chat_export.pdf", "application/pdf"
-    else:
-        raise ValueError(f"Unsupported export format: {format}")
-def prepare_export_response(
-    content: Union[str, List[dict]],
-    format: str,
-    filename_prefix: str = "chat_export"
-) -> StreamingResponse:
-    """
-    Prepare a StreamingResponse for export, using the given filename prefix.
-    """
-    stream, filename, media_type = export_chat_as_file(content, format)
-    # Replace "chat_export" with custom prefix if needed
-    final_filename = filename.replace("chat_export", filename_prefix)
-    return StreamingResponse(
-        stream,
-        media_type=media_type,
-        headers={"Content-Disposition": f"attachment; filename={final_filename}"}
-    )
 def _render_rich_text(text: str) -> str:
     """
@@ -158,3 +197,46 @@ def generate_pdf_file_from_blocks(blocks: List[dict]) -> BytesIO:
     buffer.seek(0)
     return buffer

 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, ListFlowable, ListItem
 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
 from reportlab.lib.enums import TA_CENTER, TA_LEFT
+from reportlab.lib.units import inch
 from io import BytesIO
 import re
     return buffer
+def _clean_text_for_pdf(text: str) -> str:
+    """
+    Clean text for PDF generation to handle special characters and formatting.
+    """
+    # Remove or replace problematic characters
+    text = text.replace('\u2019', "'")  # Smart apostrophe
+    text = text.replace('\u2018', "'")  # Smart apostrophe
+    text = text.replace('\u201c', '"')  # Smart quote
+    text = text.replace('\u201d', '"')  # Smart quote
+    text = text.replace('\u2013', '-')  # En dash
+    text = text.replace('\u2014', '-')  # Em dash
+    # Handle markdown-style formatting
+    text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text)  # Bold
+    text = re.sub(r'\*(.+?)\*', r'<i>\1</i>', text)      # Italic
+    return text
 def generate_pdf_file(text: str) -> BytesIO:
+    """
+    Improved PDF generation with proper text wrapping and formatting.
+    """
     buffer = BytesIO()
+    doc = SimpleDocTemplate(
+        buffer,
+        pagesize=letter,
+        leftMargin=inch,
+        rightMargin=inch,
+        topMargin=inch,
+        bottomMargin=inch
+    )
     styles = getSampleStyleSheet()
+    # Create custom styles for better formatting
+    role_style = ParagraphStyle(
+        name="RoleStyle",
+        parent=styles["Normal"],
+        fontSize=12,
+        fontName="Helvetica-Bold",
+        spaceAfter=6,
+        textColor='blue'
+    )
+    content_style = ParagraphStyle(
+        name="ContentStyle",
+        parent=styles["Normal"],
+        fontSize=10,
+        fontName="Helvetica",
+        leading=14,  # Line spacing
+        spaceAfter=12,
+        leftIndent=20
+    )
     story = []
+    # Split text into message blocks
+    blocks = text.split("\n\n")
+    for block in blocks:
+        if not block.strip():
+            continue
+        # Clean the text for PDF
+        clean_block = _clean_text_for_pdf(block.strip())
+        # Check if this is a role indicator (user:, assistant:, etc.)
+        lines = clean_block.split('\n', 1)
+        if len(lines) > 1 and lines[0].strip().endswith(':'):
+            # This is a role header
+            role = lines[0].strip()
+            content = lines[1].strip() if len(lines) > 1 else ""
+            # Add role header
+            story.append(Paragraph(role, role_style))
+            # Add content if it exists
+            if content:
+                # Split long content into smaller paragraphs for better formatting
+                content_paragraphs = content.split('\n')
+                for para in content_paragraphs:
+                    if para.strip():
+                        story.append(Paragraph(para.strip(), content_style))
+        else:
+            # Regular content block
+            story.append(Paragraph(clean_block, content_style))
+        # Add some space between message blocks
         story.append(Spacer(1, 12))
     doc.build(story)
     buffer.seek(0)
     return buffer
 def _render_rich_text(text: str) -> str:
     """
     buffer.seek(0)
     return buffer
def export_chat_as_file(content: Union[str, List[dict]], format: str) -> Tuple[BytesIO, str, str]:
    """
    Export either a list of chat messages or a summary string to a file.

    Args:
        content: A list of message dicts (rendered with
            ``format_messages_for_export``) or an already formatted string.
        format: ``"txt"``, ``"docx"`` or ``"pdf"``. Matching ignores case and
            surrounding whitespace, so ``"PDF"`` is accepted as well.

    Returns:
        A ``(stream, filename, media_type)`` tuple; the filename is
        ``chat_export.<format>``.

    Raises:
        ValueError: If ``content`` is neither a list nor a string
            ("Unsupported content type"), or if ``format`` is not one of the
            supported formats ("Unsupported export format: ...").
    """
    if not isinstance(content, (list, str)):
        raise ValueError("Unsupported content type")

    media_types = {
        "txt": "text/plain",
        "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "pdf": "application/pdf",
    }

    # Validate the format before doing any formatting work so a bad request
    # fails fast; a non-string format simply counts as unsupported.
    export_format = format.strip().lower() if isinstance(format, str) else None
    if export_format not in media_types:
        raise ValueError(f"Unsupported export format: {format}")

    if isinstance(content, list):
        text = format_messages_for_export(content)
    else:
        text = content.strip()

    generators = {
        "txt": generate_txt_file,
        "docx": generate_docx_file,
        "pdf": generate_pdf_file,
    }
    stream = generators[export_format](text)
    return stream, f"chat_export.{export_format}", media_types[export_format]
def prepare_export_response(
    content: Union[str, List[dict]],
    format: str,
    filename_prefix: str = "chat_export"
) -> StreamingResponse:
    """
    Prepare a StreamingResponse for export, using the given filename prefix.

    The export itself is delegated to ``export_chat_as_file``; this function
    only renames the file and builds the download headers.

    Args:
        content: Messages or summary text, passed through unchanged.
        format: Export format, passed through unchanged.
        filename_prefix: Replaces the default ``chat_export`` stem of the
            generated filename.

    Returns:
        A ``StreamingResponse`` whose ``Content-Disposition`` header marks
        the body as an attachment with the final filename.

    Raises:
        ValueError: Propagated from ``export_chat_as_file``.
    """
    from urllib.parse import quote  # local import: stdlib only

    stream, filename, media_type = export_chat_as_file(content, format)

    # Replace "chat_export" with custom prefix if needed
    final_filename = filename.replace("chat_export", filename_prefix)

    # The prefix may come from a caller-controlled value, so keep it from
    # breaking or forging the header: neutralize CR/LF, quotes and
    # backslashes, and send the name as a quoted-string so spaces and
    # semicolons survive.
    safe_filename = re.sub(r'[\r\n"\\]', "_", final_filename)
    ascii_filename = safe_filename.encode("ascii", "replace").decode("ascii")
    disposition = f'attachment; filename="{ascii_filename}"'
    if ascii_filename != safe_filename:
        # Non-ASCII names additionally get the RFC 5987 encoded form; the
        # ASCII version above remains as a fallback for older clients.
        disposition += f"; filename*=UTF-8''{quote(safe_filename)}"

    return StreamingResponse(
        stream,
        media_type=media_type,
        headers={"Content-Disposition": disposition}
    )