Copy-AI

Build error

Wajahat698 committed on Aug 27, 2024

Commit

6d91132

verified ·

1 Parent(s): 5be33ea

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -149,16 +149,20 @@ def clean_text(text):
     # Replace escaped newlines with actual newlines
     text = text.replace('\\n', '\n')
-    # Remove any span and italic tags
-    text = re.sub(r'<span[^>]*>', '', text)
-    text = re.sub(r'</span>', '', text)
-    text = re.sub(r'<i[^>]*>', '', text)
-    text = re.sub(r'</i>', '', text)
-    # Preserve and correctly format markdown links (don't modify URLs)
-    #text = re.sub(r'\[([^\]]+)\]\((https?://[^\)]+)\)', r'\1: \2', text)
     # Split the text into paragraphs
     paragraphs = text.split('\n\n')
@@ -168,7 +172,7 @@ def clean_text(text):
         lines = paragraph.split('\n')
         cleaned_lines = []
         for line in lines:
-            # Preserve bold formatting for headings
             if line.strip().startswith('**') and line.strip().endswith('**'):
                 cleaned_line = line.strip()
             else:
@@ -176,7 +180,7 @@ def clean_text(text):
                 cleaned_line = re.sub(r'\*|\−|\∗', '', line)
                 cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', cleaned_line)
-            # Handle bullet points
             if cleaned_line.strip().startswith('-'):
                 cleaned_line = '\n' + cleaned_line.strip()

     # Replace escaped newlines with actual newlines
     text = text.replace('\\n', '\n')
+    # Convert <a> tags to Markdown links
+    def convert_links(match):
+        url = match.group(1)
+        link_text = match.group(2)
+        return f"[{link_text}]({url})"
+    # Handle <a> tags to preserve clickable URLs in Markdown format
+    text = re.sub(r'<a [^>]*href="([^"]+)"[^>]*>(.*?)</a>', convert_links, text)
+    # Remove <span>, <i>, <b>, and other unwanted HTML tags
+    text = re.sub(r'<span[^>]*>|</span>|<i[^>]*>|</i>|<b[^>]*>|</b>', '', text)
+    # Remove any remaining HTML tags
+    text = re.sub(r'<[^>]+>', '', text)
     # Split the text into paragraphs
     paragraphs = text.split('\n\n')
         lines = paragraph.split('\n')
         cleaned_lines = []
         for line in lines:
+            # Preserve and correctly format headings or bold text
             if line.strip().startswith('**') and line.strip().endswith('**'):
                 cleaned_line = line.strip()
             else:
                 cleaned_line = re.sub(r'\*|\−|\∗', '', line)
                 cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', cleaned_line)
+            # Handle bullet points correctly
             if cleaned_line.strip().startswith('-'):
                 cleaned_line = '\n' + cleaned_line.strip()