Wajahat698 committed on
Commit
17dce32
·
verified ·
1 Parent(s): efc0e13

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -33
app.py CHANGED
@@ -146,39 +146,57 @@ def send_feedback_via_email(name, email, feedback):
146
  st.error(f"Error sending email: {e}")
147
 
148
 
149
def clean_html_text(text):
    """Convert an HTML fragment to lightly formatted, Markdown-style text.

    Anchors that carry an ``href`` become ``[label](url)`` links, the inline
    styling tags span/i/b/u/em/strong are unwrapped so only their content
    remains, and h1-h6 headings become ``#``-prefixed lines of their own.
    Whitespace is then normalised while keeping line and paragraph breaks.

    Args:
        text: HTML markup to clean.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    soup = BeautifulSoup(text, 'html.parser')

    # Convert <a> tags to Markdown links.
    for anchor in soup.find_all('a'):
        label = anchor.get_text()
        href = anchor.get('href')
        # Anchors without an href would raise KeyError on anchor['href'];
        # keep only their label instead of failing the whole clean-up.
        anchor.replace_with(f"[{label}]({href})" if href else label)

    # Drop inline styling tags but keep what they wrap.
    for element in soup.find_all(['span', 'i', 'b', 'u', 'em', 'strong']):
        element.unwrap()

    # Turn headings into Markdown headings. The surrounding blank lines keep
    # the '#' marker at the start of a line so it does not fuse with
    # neighbouring text once the tree is flattened.
    for level in range(1, 7):
        for heading in soup.find_all(f'h{level}'):
            heading.replace_with(
                f"\n\n{'#' * level} {heading.get_text()}\n\n"
            )

    cleaned_text = soup.get_text()

    # Collapse runs of horizontal whitespace only. Collapsing every
    # whitespace character (r'\s+') would also erase the paragraph breaks
    # that are normalised below.
    cleaned_text = re.sub(r'[^\S\n]+', ' ', cleaned_text)
    cleaned_text = re.sub(r' ?\n ?', '\n', cleaned_text)
    # Two or more consecutive newlines become exactly one paragraph break.
    cleaned_text = re.sub(r'\n{2,}', '\n\n', cleaned_text)

    return cleaned_text.strip()
177
 
178
def convert_html_to_markdown(html_content):
    """Return *html_content* converted to Markdown by ``markdownify``.

    The call passes ``strip=['img', 'video']`` so that markdownify applies its
    strip handling to those two tags.
    """
    tags_to_strip = ['img', 'video']
    markdown = markdownify(html_content, strip=tags_to_strip)
    return markdown
181
-
182
  def side():
183
  with st.sidebar.form(key='feedback_form'):
184
 
@@ -591,8 +609,7 @@ if prompt :
591
  full_response = output["output"]
592
  full_response= replace_terms(full_response)
593
 
594
- markdown_text = convert_html_to_markdown(full_response)
595
- cleaned_text = clean_html_text(markdown_text)
596
 
597
 
598
  #cleaned_text = re.sub(r'</span>', '', cleaned_text)
 
146
  st.error(f"Error sending email: {e}")
147
 
148
 
149
# Opening and closing <span>/<i> tags only. Requiring whitespace, '/' or '>'
# right after the tag name keeps look-alike tags such as <img>, <input> or
# <iframe> from being deleted.
_INLINE_TAG_RE = re.compile(r'</?(?:span|i)(?:[\s/][^>]*)?>')
# URLs are captured so they can be split out and left untouched.
_URL_RE = re.compile(r'(https?://[^\s)\]]+)')
# ASCII asterisk, minus sign and asterisk operator.
_STRAY_MARK_RE = re.compile(r'[*−∗]')
# A lowercase letter directly followed by an uppercase one ("fooBar").
_MERGED_WORD_RE = re.compile(r'([a-z])([A-Z])')
_WHITESPACE_RE = re.compile(r'\s+')


def _clean_plain_line(line):
    """Strip stray marks and split merged words in *line*, skipping URLs."""
    parts = _URL_RE.split(line)
    # With a single capturing group, split() puts the URLs at odd indices;
    # only the even-indexed, non-URL pieces are rewritten.
    for index in range(0, len(parts), 2):
        piece = _STRAY_MARK_RE.sub('', parts[index])
        parts[index] = _MERGED_WORD_RE.sub(r'\1 \2', piece)
    return ''.join(parts)


def clean_text(text):
    """Tidy a text response for display.

    Steps, in order:
      1. Literal backslash-n sequences become real newlines.
      2. ``<span>`` and ``<i>`` tags are removed; their content is kept.
      3. The text is split into paragraphs (blank-line separated) and lines.
         A line wrapped in ``**...**`` is kept as written; every other line
         has asterisks and the minus-sign / asterisk-operator characters
         removed and a space inserted between a lowercase letter and an
         uppercase letter that directly follows it. URLs are exempt from
         this rewriting.
      4. Runs of whitespace inside a line collapse to one space, and empty
         paragraphs are dropped.

    Args:
        text: The raw response text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned text, with paragraphs separated by one blank line.
    """
    if not text:
        return ''

    text = text.replace('\\n', '\n')
    text = _INLINE_TAG_RE.sub('', text)

    cleaned_paragraphs = []
    for paragraph in text.split('\n\n'):
        cleaned_lines = []
        for line in paragraph.split('\n'):
            stripped = line.strip()
            if stripped.startswith('**') and stripped.endswith('**'):
                # Preserve bold formatting for headings.
                cleaned_line = stripped
            else:
                cleaned_line = _clean_plain_line(line)
            cleaned_lines.append(
                _WHITESPACE_RE.sub(' ', cleaned_line).strip()
            )

        cleaned_paragraph = '\n'.join(cleaned_lines)
        if cleaned_paragraph:
            cleaned_paragraphs.append(cleaned_paragraph)

    return '\n\n'.join(cleaned_paragraphs)
198
 
199
+
 
 
 
200
  def side():
201
  with st.sidebar.form(key='feedback_form'):
202
 
 
609
  full_response = output["output"]
610
  full_response= replace_terms(full_response)
611
 
612
+ cleaned_text = clean_text(markdown_text)
 
613
 
614
 
615
  #cleaned_text = re.sub(r'</span>', '', cleaned_text)