Spaces:

trustlogic
/

Live

Sleeping

App Files Files Community

Wajahat698 committed on Nov 22, 2024

Commit

ee1645b

verified ·

1 Parent(s): 0986292

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -20

app.py CHANGED Viewed

@@ -356,44 +356,65 @@ def extract_name(email):
 def clean_text(text):
     """
-    Clean text to remove broken formatting while preserving valid Markdown.
     """
     text = text.replace('\\n', '\n')  # Normalize newlines
-    # Remove all HTML tags (but preserve Markdown formatting)
     text = re.sub(r'<[^>]*>', '', text)
     # Fix spacing issues between numbers and words
     text = re.sub(r'(\d+)\s*(B|M|T|billion|million|trillion)', r'\1 \2', text)
     text = re.sub(r'(\d)\s*([a-zA-Z])', r'\1 \2', text)
     text = re.sub(r'([a-zA-Z])\s*(\d)', r'\1 \2', text)
-    # Preserve Markdown links and formatting
-    text = re.sub(r'\*+', '*', text)  # Remove excess asterisks but keep single or double for Markdown
-    # Split text into paragraphs and clean each paragraph
     paragraphs = text.split('\n\n')
     cleaned_paragraphs = []
     for paragraph in paragraphs:
         lines = paragraph.split('\n')
         cleaned_lines = []
         for line in lines:
-            # Handle bullet points properly
-            if line.strip().startswith('-'):
                 cleaned_line = line.strip()
             else:
-                # Preserve valid Markdown formatting
-                cleaned_line = re.sub(r'[^\w\s\*_\[\]\(\)-]', '', line)
                 cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', cleaned_line)
             cleaned_lines.append(cleaned_line)
-        # Join lines back into paragraphs
         cleaned_paragraph = '\n'.join(cleaned_lines)
         cleaned_paragraphs.append(cleaned_paragraph)
-    # Join all paragraphs together
-    cleaned_text = '\n\n'.join(cleaned_paragraphs)
     return cleaned_text
@@ -1135,12 +1156,11 @@ Before submitting any content, ensure it includes:
 #### Report/Article/Write-up/ Blog
 - **Introduction**: "Here is a draft of your [Annual Report/Article/Write-up]. Feel free to suggest further refinements."
-- **Content**:
   - Give headlines  conversational headings to structure content . Do not include source links within the content.
   - Write from the perspective of being part of the organization, using "we".
   - Maintain an active, engaging, and direct tone.
-  **Donot give source link in content**
 #### Social Media Posts
 - **Introduction Line**: "Here is a draft of your social media post. Feel free to suggest further refinements."
 - **Content**:
@@ -1817,7 +1837,7 @@ def handle_document_query(query):
     # Generate AI response with document context
     full_prompt = f"Document Content:\n{doc_content}\n\nUser Query: {query}\n\nResponse:"
     try:
-        llm = ChatOpenAI(model="gpt-4", temperature=0.5, api_key=openai_api_key)
         response = llm.invoke(full_prompt)
         return response.content
     except Exception as e:
@@ -2008,7 +2028,7 @@ def handle_prompt(prompt):
                               "    Use the following structure:"
                               "      -Heuristics: examples (e.g., social proof, authority, commitment)."
                               "    -Creative Techniques: examples (list only relevant marketing techniques without additional details)."
-                            "The final output must not include AI jargon or marketing buzzwords (donot provide source in content) as instruicted and Give well title and sub-headlines. Strictly interconnected sections having Flowing narrative and audience engagement at its peak to create an impactful and memorable experience.Avoid mentioning trustbucket names."
                         )
                         else:
                             appended_instructions = ""

 def clean_text(text):
     """
     Clean generated text: normalise newlines, strip HTML tags, fix
     number/word spacing and percent-encode URLs in Markdown links.

     Processing steps, in order:
       1. Literal backslash-n sequences become real newlines.
       2. Anything shaped like an HTML tag (``<...>``) is removed.
       3. The URL of every ``[label](http...)`` link is percent-encoded and
          swapped for a placeholder so the later steps cannot alter it.
       4. A single space is forced between digits and adjacent letters
          (``5M`` -> ``5 M``); this never reaches across a line break.
       5. Line by line: lines wrapped in ``**`` are kept as bold headings;
          on other lines ``*``, the Unicode minus sign and the asterisk
          operator are removed and a space is inserted at
          lowercase/uppercase boundaries. Runs of whitespace collapse to
          one space and each line is trimmed.
       6. Empty paragraphs are dropped and the link URLs are restored.

     Args:
         text: Raw text to clean. ``None`` or an empty string yields ``""``.

     Returns:
         The cleaned text, with paragraphs separated by a blank line.
     """
     # Imported locally so the function does not rely on the module
     # importing ``quote`` at the top of the file.
     from urllib.parse import quote
     if not text:
         return ""
     # Private-use characters delimit URL placeholders; they contain no
     # letters, asterisks or whitespace, so no cleaning regex touches them.
     sentinel_open, sentinel_close = "\ue000", "\ue001"
     # Remove stray sentinels so input can never be mistaken for a placeholder.
     text = text.replace(sentinel_open, "").replace(sentinel_close, "")
     text = text.replace('\\n', '\n')  # Normalize newlines
     # Remove all HTML tags
     text = re.sub(r'<[^>]*>', '', text)
     encoded_urls = []
     def protect_url(match):
         """Encode the link URL, store it, and return the link with a placeholder."""
         # '%' and '#' stay literal: existing escapes must not be encoded
         # twice and fragment identifiers must keep working.
         encoded_urls.append(quote(match.group(2), safe=":/?#&=%+@,;"))
         placeholder = f"{sentinel_open}{len(encoded_urls) - 1}{sentinel_close}"
         return f"{match.group(1)}{placeholder})"
     # The URL part stops at the link's closing parenthesis but allows one
     # level of balanced parentheses, so adjacent links on a line stay separate.
     text = re.sub(
         r'(\[[^\]]+\]\()(https?://(?:[^\s()]|\([^\s()]*\))+)\)',
         protect_url,
         text,
     )
     # Fix spacing issues between numbers and words. Only horizontal
     # whitespace is consumed so that lines and paragraphs are never merged.
     gap = r'[^\S\r\n]*'
     text = re.sub(rf'(\d+){gap}(B|M|T|billion|million|trillion)', r'\1 \2', text)
     text = re.sub(rf'(\d){gap}([a-zA-Z])', r'\1 \2', text)
     text = re.sub(rf'([a-zA-Z]){gap}(\d)', r'\1 \2', text)
     cleaned_paragraphs = []
     for paragraph in text.split('\n\n'):
         cleaned_lines = []
         for line in paragraph.split('\n'):
             stripped = line.strip()
             if stripped.startswith('**') and stripped.endswith('**'):
                 # Preserve bold formatting for headings
                 cleaned_line = stripped
             else:
                 # Remove asterisks and look-alike symbols, then split camelCase
                 cleaned_line = re.sub(r'[*−∗]', '', line)
                 cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', cleaned_line)
             # Remove extra spaces
             cleaned_lines.append(re.sub(r'\s+', ' ', cleaned_line).strip())
         cleaned_paragraph = '\n'.join(cleaned_lines)
         if cleaned_paragraph:
             cleaned_paragraphs.append(cleaned_paragraph)
     cleaned_text = '\n\n'.join(cleaned_paragraphs)
     # Put the encoded URLs back in place of their placeholders.
     return re.sub(
         r'\ue000(\d+)\ue001',
         lambda m: encoded_urls[int(m.group(1))],
         cleaned_text,
     )
 #### Report/Article/Write-up/ Blog
 - **Introduction**: "Here is a draft of your [Annual Report/Article/Write-up]. Feel free to suggest further refinements."
+- **Content**: **Donot give source link in content**
   - Give headlines  conversational headings to structure content . Do not include source links within the content.
   - Write from the perspective of being part of the organization, using "we".
   - Maintain an active, engaging, and direct tone.
 #### Social Media Posts
 - **Introduction Line**: "Here is a draft of your social media post. Feel free to suggest further refinements."
 - **Content**:
     # Generate AI response with document context
     full_prompt = f"Document Content:\n{doc_content}\n\nUser Query: {query}\n\nResponse:"
     try:
+        llm = ChatOpenAI(model="gpt-4o", temperature=0.5, api_key=openai_api_key)
         response = llm.invoke(full_prompt)
         return response.content
     except Exception as e:
                               "    Use the following structure:"
                               "      -Heuristics: examples (e.g., social proof, authority, commitment)."
                               "    -Creative Techniques: examples (list only relevant marketing techniques without additional details)."
+                            "The final output must not include AI jargon or marketing buzzwords and Give well title and 2-3 sub-headlines. Strictly interconnected sections having Flowing narrative and audience engagement at its peak to create an impactful and memorable experience.Avoid mentioning trustbucket names."
                         )
                         else:
                             appended_instructions = ""