Wajahat698 committed on
Commit
ee37b61
·
verified ·
1 Parent(s): 8aa4386

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -4
app.py CHANGED
@@ -146,9 +146,38 @@ def send_feedback_via_email(name, email, feedback):
146
  st.error(f"Error sending email: {e}")
147
 
148
 
149
def clean_text(text):
    """Convert an HTML string to Markdown with markdownify.

    `img` and `video` are passed in markdownify's `strip` option.
    """
    tags_to_strip = ['img', 'video']
    markdown = markdownify(text, strip=tags_to_strip)
    return markdown
152
 
153
  def side():
154
  with st.sidebar.form(key='feedback_form'):
@@ -561,8 +590,9 @@ if prompt :
561
  })
562
  full_response = output["output"]
563
  full_response= replace_terms(full_response)
564
-
565
- cleaned_text = clean_text(full_response)
 
566
 
567
 
568
  #cleaned_text = re.sub(r'</span>', '', cleaned_text)
 
146
  st.error(f"Error sending email: {e}")
147
 
148
 
149
def clean_html_text(text):
    """Reduce an HTML fragment to plain text with light Markdown formatting.

    Links become ``[label](href)``, headings become ``#``-prefixed lines,
    the inline styling tags listed below are unwrapped, and whitespace is
    normalised while line and paragraph breaks are kept.

    Args:
        text: HTML (or plain-text) string to clean.

    Returns:
        The cleaned string with leading/trailing whitespace removed.
    """
    soup = BeautifulSoup(text, 'html.parser')

    # Convert <a> tags to Markdown links. An anchor without an href has no
    # target, so keep only its label instead of raising KeyError.
    for anchor in soup.find_all('a'):
        label = anchor.get_text()
        href = anchor.get('href')
        anchor.replace_with(f"[{label}]({href})" if href else label)

    # Remove inline styling tags but preserve their text.
    for tag in ('span', 'i', 'b', 'u', 'em', 'strong'):
        for element in soup.find_all(tag):
            element.unwrap()

    # Turn <h1>..<h6> into Markdown headings. The surrounding blank lines keep
    # the heading from fusing with adjacent text once the tags are gone.
    for level in range(1, 7):
        for element in soup.find_all(f'h{level}'):
            heading = f"{'#' * level} {element.get_text()}"
            element.replace_with(f"\n\n{heading}\n\n")

    cleaned_text = soup.get_text()

    # Collapse runs of horizontal whitespace only; matching newlines here
    # would flatten the paragraph breaks this function is meant to keep.
    cleaned_text = re.sub(r'[^\S\n]+', ' ', cleaned_text)
    # Drop the single space left on either side of a line break.
    cleaned_text = re.sub(r' ?\n ?', '\n', cleaned_text)
    # Limit consecutive blank lines to one paragraph break.
    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)

    return cleaned_text.strip()
177
+
178
def convert_html_to_markdown(html_content):
    """Convert an HTML string to Markdown with markdownify.

    `img` and `video` are passed in markdownify's `strip` option.
    """
    tags_to_strip = ['img', 'video']
    markdown = markdownify(html_content, strip=tags_to_strip)
    return markdown
181
 
182
  def side():
183
  with st.sidebar.form(key='feedback_form'):
 
590
  })
591
  full_response = output["output"]
592
  full_response= replace_terms(full_response)
593
+
594
+ markdown_text = convert_html_to_markdown(full_response)
595
+ cleaned_text = clean_text(markdown_text)
596
 
597
 
598
  #cleaned_text = re.sub(r'</span>', '', cleaned_text)