Copy-AI

Build error

App Files Files Community

Wajahat698 committed on Aug 27, 2024

Commit

17dce32

verified ·

1 Parent(s): efc0e13

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -33

app.py CHANGED Viewed

@@ -146,39 +146,57 @@ def send_feedback_via_email(name, email, feedback):
         st.error(f"Error sending email: {e}")
-def clean_html_text(text):
-    """Cleans HTML text to preserve basic formatting."""
-    soup = BeautifulSoup(text, 'html.parser')
-    # Convert <a> tags to Markdown links
-    for a in soup.find_all('a'):
-        a.replace_with(f"[{a.get_text()}]({a['href']})")
-    # Remove unwanted tags but preserve their text
-    for tag in ['span', 'i', 'b', 'u', 'em', 'strong']:
-        for element in soup.find_all(tag):
-            element.unwrap()  # Remove the tag but keep the content
-    # Handle headings and preserve formatting
-    for header in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
-        for element in soup.find_all(header):
-            level = header[1]  # Extract heading level (1-6)
-            element.replace_with(f"{'#' * int(level)} {element.get_text()}")
-    # Get the cleaned text
-    cleaned_text = soup.get_text()
-    # Maintain paragraph breaks and replace multiple spaces with a single space
-    cleaned_text = re.sub(r'\n\s*\n', '\n\n', cleaned_text)  # Maintain paragraph breaks
-    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replace multiple spaces with a single space
-    cleaned_text = cleaned_text.strip()  # Remove leading/trailing spaces
     return cleaned_text
-def convert_html_to_markdown(html_content):
-    """Converts HTML content to Markdown format using markdownify."""
-    return markdownify(html_content, strip=['img', 'video'])
 def side():
     with st.sidebar.form(key='feedback_form'):
@@ -591,8 +609,7 @@ if prompt :
                 full_response = output["output"]
                 full_response= replace_terms(full_response)
-                markdown_text = convert_html_to_markdown(full_response)
-                cleaned_text = clean_html_text(markdown_text)
                 #cleaned_text = re.sub(r'</span>', '', cleaned_text)

         st.error(f"Error sending email: {e}")
+# Opening/closing <span> and <i> tags only; the \b stops <i from also
+# matching <img>, <iframe>, <input>, ...
+_INLINE_TAG_RE = re.compile(r'</?(?:span|i)\b[^>]*>')
+# http(s) URLs, captured so re.split() returns them as their own segments.
+# Stops at whitespace, closing brackets, quotes and '*' so markdown
+# delimiters around a URL are not swallowed into it.
+_URL_RE = re.compile(r'(https?://[^\s)\]<>"*]+)')
+# ASCII asterisk, U+2212 MINUS SIGN and U+2217 ASTERISK OPERATOR.
+_STRAY_SYMBOL_RE = re.compile(r'[*\u2212\u2217]')
+# A lowercase letter directly followed by an uppercase one ("fooBar").
+_MERGED_WORDS_RE = re.compile(r'([a-z])([A-Z])')
+
+
+def _clean_line(line):
+    """Clean one line: keep **bold** heading lines, scrub everything else."""
+    stripped = line.strip()
+    if stripped.startswith('**') and stripped.endswith('**'):
+        # Whole-line bold is treated as a heading and kept verbatim.
+        cleaned = stripped
+    else:
+        segments = _URL_RE.split(line)
+        # With one capture group, re.split() puts the URLs at odd indices;
+        # only the even (non-URL) segments are rewritten so links stay intact.
+        for index in range(0, len(segments), 2):
+            segment = _STRAY_SYMBOL_RE.sub('', segments[index])
+            segments[index] = _MERGED_WORDS_RE.sub(r'\1 \2', segment)
+        cleaned = ''.join(segments)
+    # Collapse runs of whitespace and trim the ends.
+    return re.sub(r'\s+', ' ', cleaned).strip()
+
+
+def clean_text(text):
+    """Normalise a response string for display.
+
+    Steps, in order:
+      1. Turn literal backslash-n sequences into real newlines.
+      2. Drop <span> and <i> tags (opening and closing), keeping their text.
+      3. Per line: keep lines that are entirely **bold**; otherwise remove
+         '*', U+2212 and U+2217 and insert a space between a lowercase and
+         an uppercase letter, leaving http(s) URLs untouched.
+      4. Collapse whitespace, drop empty lines and empty paragraphs.
+
+    Args:
+        text: The raw text; ``None`` or an empty string yields ``''``.
+
+    Returns:
+        The cleaned text, with paragraphs separated by one blank line.
+    """
+    if not text:
+        return ''
+    text = text.replace('\\n', '\n')
+    text = _INLINE_TAG_RE.sub('', text)
+    cleaned_paragraphs = []
+    for paragraph in text.split('\n\n'):
+        cleaned_lines = [_clean_line(line) for line in paragraph.split('\n')]
+        # Skip lines that ended up empty so stray blank lines do not survive.
+        cleaned_paragraphs.append('\n'.join(line for line in cleaned_lines if line))
+    cleaned_text = '\n\n'.join(para for para in cleaned_paragraphs if para)
     return cleaned_text
 def side():
     with st.sidebar.form(key='feedback_form'):
                 full_response = output["output"]
                 full_response= replace_terms(full_response)
+                # Pass the term-replaced response straight to clean_text(): the
+                # markdown_text name used here before is not assigned anywhere in
+                # this hunk (its assignment was removed together with
+                # convert_html_to_markdown), so referencing it would raise NameError.
+                cleaned_text = clean_text(full_response)
                 #cleaned_text = re.sub(r'</span>', '', cleaned_text)