Girish Jeswani committed on
Commit
b206ba3
·
1 Parent(s): a3ae13f

fix pdf formatting

Browse files
multi_llm_chatbot_backend/app/utils/chat_summary.py CHANGED
@@ -15,7 +15,21 @@ async def generate_summary_from_messages(messages: List[dict], llm: LLMClient, m
15
 
16
  system_prompt = (
17
  "You are an academic assistant. Summarize the following PhD chat conversation "
18
- "into concise bullet points (max 10) or short paragraphs. Focus on insights, questions, and advice."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  )
20
 
21
  context = [{"role": "user", "content": f"Chat Log:\n{full_text}"}]
@@ -27,17 +41,49 @@ async def generate_summary_from_messages(messages: List[dict], llm: LLMClient, m
27
  max_tokens=max_tokens
28
  )
29
 
30
- return summary.strip()
 
 
31
 
32
  except Exception as e:
33
  logger.error(f"Error generating summary: {str(e)}")
34
  return "Summary generation failed. Please try again later."
35
 
36
- def parse_summary_to_blocks(summary_text: str) -> List[Dict]:
37
- #summary_text = re.sub(r'(?<!\n)([*•] )', r'\n\1', summary_text)
38
- #summary_text = re.sub(r'(?<!\n)(\d+\.\s+)', r'\n\1', summary_text)
39
- #summary_text = re.sub(r'(?<=[.!?])(?=\S)', ' ', summary_text)
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  lines = summary_text.strip().splitlines()
42
  blocks = []
43
  current_block = None
@@ -51,7 +97,7 @@ def parse_summary_to_blocks(summary_text: str) -> List[Dict]:
51
  if not line:
52
  continue
53
 
54
- # Match section headings (e.g. **Title:**) as heading block
55
  heading_match = re.match(r'^\*\*(.+?)\*\*:?$', line)
56
  if heading_match:
57
  flush_current_block()
@@ -60,16 +106,17 @@ def parse_summary_to_blocks(summary_text: str) -> List[Dict]:
60
  current_block = None
61
  continue
62
 
63
- # Match bullet list
64
- if line.startswith("* "):
 
65
  if current_block is None or current_block["type"] != "list" or current_block.get("style") != "bullet":
66
  flush_current_block()
67
  current_block = {"type": "list", "style": "bullet", "items": []}
68
- current_block["items"].append(line[2:].strip())
69
  continue
70
 
71
- # Match numbered list
72
- number_match = re.match(r'^\d+\.\s+(.*)', line)
73
  if number_match:
74
  if current_block is None or current_block["type"] != "list" or current_block.get("style") != "numbered":
75
  flush_current_block()
@@ -85,8 +132,48 @@ def parse_summary_to_blocks(summary_text: str) -> List[Dict]:
85
 
86
  flush_current_block()
87
 
88
- import pprint
89
- print("[DEBUG] Summary Blocks:")
90
- pprint.pprint(blocks)
 
 
 
 
 
91
  return blocks
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  system_prompt = (
17
  "You are an academic assistant. Summarize the following PhD chat conversation "
18
+ "into a well-formatted summary with clear bullet points. "
19
+ "Please format your response as follows:\n"
20
+ "- Use bullet points (starting with *) for key insights\n"
21
+ "- Put each bullet point on a separate line\n"
22
+ "- Include section headings if appropriate (formatted as **Section Name:**)\n"
23
+ "- Focus on insights, questions, and actionable advice\n"
24
+ "- Maximum 10 bullet points\n\n"
25
+ "Example format:\n"
26
+ "**Key Insights:**\n"
27
+ "* First main point about the conversation\n"
28
+ "* Second important insight\n"
29
+ "* Third key takeaway\n\n"
30
+ "**Recommendations:**\n"
31
+ "* First actionable recommendation\n"
32
+ "* Second suggestion"
33
  )
34
 
35
  context = [{"role": "user", "content": f"Chat Log:\n{full_text}"}]
 
41
  max_tokens=max_tokens
42
  )
43
 
44
+ # Post-process the summary to ensure proper formatting
45
+ formatted_summary = _format_summary_text(summary.strip())
46
+ return formatted_summary
47
 
48
  except Exception as e:
49
  logger.error(f"Error generating summary: {str(e)}")
50
  return "Summary generation failed. Please try again later."
51
 
 
 
 
 
52
 
53
+ def _format_summary_text(summary_text: str) -> str:
54
+ """
55
+ Post-process the summary text to ensure proper bullet point formatting.
56
+ """
57
+ # Fix common formatting issues
58
+
59
+ # Add line breaks before bullet points that don't have them
60
+ summary_text = re.sub(r'(?<!\n)([*•] )', r'\n\1', summary_text)
61
+
62
+ # Add line breaks before numbered lists that don't have them
63
+ summary_text = re.sub(r'(?<!\n)(\d+\.\s+)', r'\n\1', summary_text)
64
+
65
+ # Add line breaks after periods followed by capital letters (likely new sentences)
66
+ summary_text = re.sub(r'(?<=[.!?])(?=\s*[*•]\s)', '\n', summary_text)
67
+
68
+ # Clean up multiple consecutive newlines
69
+ summary_text = re.sub(r'\n{3,}', '\n\n', summary_text)
70
+
71
+ # Ensure bullet points are properly spaced
72
+ summary_text = re.sub(r'\n([*•] )', r'\n\n\1', summary_text)
73
+
74
+ # Fix section headings that might be run together
75
+ summary_text = re.sub(r'([.!?])\s*(\*\*[^*]+\*\*)', r'\1\n\n\2', summary_text)
76
+
77
+ return summary_text.strip()
78
+
79
+
80
+ def parse_summary_to_blocks(summary_text: str) -> List[Dict]:
81
+ """
82
+ Parse summary text into structured blocks for better formatting.
83
+ """
84
+ # First, ensure proper formatting
85
+ summary_text = _format_summary_text(summary_text)
86
+
87
  lines = summary_text.strip().splitlines()
88
  blocks = []
89
  current_block = None
 
97
  if not line:
98
  continue
99
 
100
+ # Match section headings (e.g. **Title:** or **Title**)
101
  heading_match = re.match(r'^\*\*(.+?)\*\*:?$', line)
102
  if heading_match:
103
  flush_current_block()
 
106
  current_block = None
107
  continue
108
 
109
+ # Match bullet list items (*, •, or -)
110
+ bullet_match = re.match(r'^[*•-]\s+(.+)', line)
111
+ if bullet_match:
112
  if current_block is None or current_block["type"] != "list" or current_block.get("style") != "bullet":
113
  flush_current_block()
114
  current_block = {"type": "list", "style": "bullet", "items": []}
115
+ current_block["items"].append(bullet_match.group(1).strip())
116
  continue
117
 
118
+ # Match numbered list items
119
+ number_match = re.match(r'^\d+\.\s+(.+)', line)
120
  if number_match:
121
  if current_block is None or current_block["type"] != "list" or current_block.get("style") != "numbered":
122
  flush_current_block()
 
132
 
133
  flush_current_block()
134
 
135
+ # Debug output to help troubleshoot
136
+ logger.info(f"[DEBUG] Parsed {len(blocks)} blocks from summary")
137
+ for i, block in enumerate(blocks):
138
+ if block["type"] == "list":
139
+ logger.info(f"Block {i}: {block['type']} ({block['style']}) with {len(block['items'])} items")
140
+ else:
141
+ logger.info(f"Block {i}: {block['type']}")
142
+
143
  return blocks
144
 
145
+
146
def format_summary_for_text_export(summary_text: str) -> str:
    """
    Format summary text for TXT and DOCX exports with consistent line breaks.

    The text is first normalized with ``_format_summary_text``. Headings,
    paragraphs and lists are then separated by exactly one blank line, while
    consecutive items of the same list (bulleted or numbered) stay together.

    Args:
        summary_text: Summary text as produced by the LLM.

    Returns:
        The re-spaced text, without leading or trailing blank lines.
    """
    heading_pattern = re.compile(r'^\*\*(.+?)\*\*:?$')
    bullet_pattern = re.compile(r'^[*•-]\s+')
    numbered_pattern = re.compile(r'^\d+\.\s+')

    formatted_lines = []
    previous_kind = None

    for raw_line in _format_summary_text(summary_text).split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if heading_pattern.match(line):
            kind = "heading"
        elif bullet_pattern.match(line):
            kind = "bullet"
        elif numbered_pattern.match(line):
            kind = "numbered"
        else:
            kind = "paragraph"

        # Only consecutive items of the same list are kept adjacent; every
        # other transition gets a single blank line (never two in a row).
        continues_list = kind in ("bullet", "numbered") and kind == previous_kind
        if formatted_lines and not continues_list:
            formatted_lines.append('')

        formatted_lines.append(line)
        previous_kind = kind

    return '\n'.join(formatted_lines)
multi_llm_chatbot_backend/app/utils/file_export.py CHANGED
@@ -7,6 +7,7 @@ from fastapi.responses import StreamingResponse
7
  from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, ListFlowable, ListItem
8
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
9
  from reportlab.lib.enums import TA_CENTER, TA_LEFT
 
10
  from io import BytesIO
11
  import re
12
 
@@ -36,63 +37,101 @@ def generate_docx_file(text: str) -> BytesIO:
36
  return buffer
37
 
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def generate_pdf_file(text: str) -> BytesIO:
 
 
 
40
  buffer = BytesIO()
41
- doc = SimpleDocTemplate(buffer, pagesize=letter)
 
 
 
 
 
 
 
42
 
43
  styles = getSampleStyleSheet()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  story = []
45
-
46
- for block in text.split("\n\n"):
47
- story.append(Paragraph(block.strip(), styles["Normal"]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  story.append(Spacer(1, 12))
49
 
50
  doc.build(story)
51
  buffer.seek(0)
52
  return buffer
53
 
54
- def export_chat_as_file(content: Union[str, List[dict]], format: str) -> Tuple[BytesIO, str, str]:
55
- """
56
- Export either a list of chat messages or a summary string to the specified format.
57
- """
58
- if isinstance(content, list):
59
- text = format_messages_for_export(content)
60
- elif isinstance(content, str):
61
- text = content.strip()
62
- else:
63
- raise ValueError("Unsupported content type")
64
-
65
- if format == "txt":
66
- return generate_txt_file(text), "chat_export.txt", "text/plain"
67
-
68
- elif format == "docx":
69
- return generate_docx_file(text), "chat_export.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
70
-
71
- elif format == "pdf":
72
- return generate_pdf_file(text), "chat_export.pdf", "application/pdf"
73
-
74
- else:
75
- raise ValueError(f"Unsupported export format: {format}")
76
-
77
- def prepare_export_response(
78
- content: Union[str, List[dict]],
79
- format: str,
80
- filename_prefix: str = "chat_export"
81
- ) -> StreamingResponse:
82
- """
83
- Prepare a StreamingResponse for export, using the given filename prefix.
84
- """
85
- stream, filename, media_type = export_chat_as_file(content, format)
86
-
87
- # Replace "chat_export" with custom prefix if needed
88
- final_filename = filename.replace("chat_export", filename_prefix)
89
-
90
- return StreamingResponse(
91
- stream,
92
- media_type=media_type,
93
- headers={"Content-Disposition": f"attachment; filename={final_filename}"}
94
- )
95
-
96
 
97
  def _render_rich_text(text: str) -> str:
98
  """
@@ -158,3 +197,46 @@ def generate_pdf_file_from_blocks(blocks: List[dict]) -> BytesIO:
158
  buffer.seek(0)
159
  return buffer
160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, ListFlowable, ListItem
8
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
9
  from reportlab.lib.enums import TA_CENTER, TA_LEFT
10
+ from reportlab.lib.units import inch
11
  from io import BytesIO
12
  import re
13
 
 
37
  return buffer
38
 
39
 
40
+ def _clean_text_for_pdf(text: str) -> str:
41
+ """
42
+ Clean text for PDF generation to handle special characters and formatting.
43
+ """
44
+ # Remove or replace problematic characters
45
+ text = text.replace('\u2019', "'") # Smart apostrophe
46
+ text = text.replace('\u2018', "'") # Smart apostrophe
47
+ text = text.replace('\u201c', '"') # Smart quote
48
+ text = text.replace('\u201d', '"') # Smart quote
49
+ text = text.replace('\u2013', '-') # En dash
50
+ text = text.replace('\u2014', '-') # Em dash
51
+
52
+ # Handle markdown-style formatting
53
+ text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text) # Bold
54
+ text = re.sub(r'\*(.+?)\*', r'<i>\1</i>', text) # Italic
55
+
56
+ return text
57
+
58
+
59
def generate_pdf_file(text: str) -> BytesIO:
    """
    Render plain export text as a PDF with one-inch margins.

    The text is split into blocks on blank lines. When a multi-line block
    starts with a line ending in ``:`` that line is drawn as a bold role
    header; every other non-empty line becomes its own paragraph, so line
    breaks inside a block are preserved rather than collapsed into one
    run-on paragraph.

    Args:
        text: Export text with blocks separated by blank lines.

    Returns:
        An in-memory buffer holding the PDF, positioned at the start.
    """
    buffer = BytesIO()
    doc = SimpleDocTemplate(
        buffer,
        pagesize=letter,
        leftMargin=inch,
        rightMargin=inch,
        topMargin=inch,
        bottomMargin=inch
    )

    styles = getSampleStyleSheet()

    # Bold header used for role lines such as "user:".
    role_style = ParagraphStyle(
        name="RoleStyle",
        parent=styles["Normal"],
        fontSize=12,
        fontName="Helvetica-Bold",
        spaceAfter=6,
        textColor='blue'
    )

    # Indented body text with extra leading for readability.
    content_style = ParagraphStyle(
        name="ContentStyle",
        parent=styles["Normal"],
        fontSize=10,
        fontName="Helvetica",
        leading=14,  # Line spacing
        spaceAfter=12,
        leftIndent=20
    )

    story = []

    for block in text.split("\n\n"):
        block = block.strip()
        if not block:
            continue

        # Clean the whole block, then work line by line.
        block_lines = [
            line.strip()
            for line in _clean_text_for_pdf(block).split('\n')
            if line.strip()
        ]

        # A first line ending in ":" followed by more text is a role header.
        if len(block_lines) > 1 and block_lines[0].endswith(':'):
            story.append(Paragraph(block_lines[0], role_style))
            block_lines = block_lines[1:]

        # One paragraph per line: Paragraph treats a newline as plain
        # whitespace, so passing the whole block would merge its lines.
        for line in block_lines:
            story.append(Paragraph(line, content_style))

        # Add some space between message blocks
        story.append(Spacer(1, 12))

    doc.build(story)
    buffer.seek(0)
    return buffer
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
  def _render_rich_text(text: str) -> str:
137
  """
 
197
  buffer.seek(0)
198
  return buffer
199
 
200
+
201
def export_chat_as_file(content: Union[str, List[dict]], format: str) -> Tuple[BytesIO, str, str]:
    """
    Export either a list of chat messages or a summary string to the specified format.

    Both arguments are validated before any formatting work is done, and the
    format name is matched case-insensitively ("PDF" and " pdf " are accepted).

    Args:
        content: A list of message dicts, or an already formatted string.
        format: One of "txt", "docx" or "pdf".

    Returns:
        A tuple of (file buffer, default filename, media type).

    Raises:
        ValueError: If ``content`` is neither a list nor a string, or if
            ``format`` is not a supported export format.
    """
    if not isinstance(content, (list, str)):
        raise ValueError("Unsupported content type")

    export_format = format.strip().lower() if isinstance(format, str) else format
    if export_format not in ("txt", "docx", "pdf"):
        # Report the value exactly as the caller passed it.
        raise ValueError(f"Unsupported export format: {format}")

    if isinstance(content, list):
        text = format_messages_for_export(content)
    else:
        text = content.strip()

    if export_format == "txt":
        return generate_txt_file(text), "chat_export.txt", "text/plain"

    if export_format == "docx":
        return generate_docx_file(text), "chat_export.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

    return generate_pdf_file(text), "chat_export.pdf", "application/pdf"
223
+
224
+
225
def prepare_export_response(
    content: Union[str, List[dict]],
    format: str,
    filename_prefix: str = "chat_export"
) -> StreamingResponse:
    """
    Prepare a StreamingResponse for export, using the given filename prefix.

    Args:
        content: A list of message dicts, or an already formatted string.
        format: Export format, passed through to ``export_chat_as_file``.
        filename_prefix: Base name for the download, without extension.

    Returns:
        A StreamingResponse carrying the file as an attachment.

    Raises:
        ValueError: Propagated from ``export_chat_as_file`` for unsupported
            content or format.
    """
    stream, filename, media_type = export_chat_as_file(content, format)

    # The prefix may come from the caller: drop characters that could break
    # out of the header value (CR/LF, quotes) or act as a path separator.
    safe_prefix = re.sub(r'[\r\n"\\/]+', '_', filename_prefix).strip() or "chat_export"

    # Replace "chat_export" with custom prefix if needed
    final_filename = filename.replace("chat_export", safe_prefix)

    # Simple names keep the original unquoted form; anything else (spaces,
    # punctuation) is sent as a quoted string so the header stays parseable.
    if re.fullmatch(r'[A-Za-z0-9._-]+', final_filename):
        disposition = f"attachment; filename={final_filename}"
    else:
        disposition = f'attachment; filename="{final_filename}"'

    return StreamingResponse(
        stream,
        media_type=media_type,
        headers={"Content-Disposition": disposition}
    )