Wajahat698 committed on
Commit
ee1645b
·
verified ·
1 Parent(s): 0986292

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -20
app.py CHANGED
@@ -356,44 +356,65 @@ def extract_name(email):
356
 
def clean_text(text):
    """
    Clean text to remove broken formatting while preserving valid Markdown.

    Steps, in order:
      1. Turn literal backslash-n sequences into real newlines.
      2. Strip anything that looks like an HTML tag.
      3. Insert a single space between a number and an adjacent word/unit
         (and between a word and an adjacent number).
      4. Reduce runs of three or more asterisks to ``**``.
      5. Per line: bullet lines (starting with ``-``) are only stripped of
         surrounding whitespace; every other line loses all characters
         except word characters, whitespace and ``* _ [ ] ( ) -`` and has
         camelCase boundaries split with a space.

    Args:
        text: The raw text to clean. ``None`` or an empty string yields "".

    Returns:
        The cleaned text, with paragraphs separated by blank lines.
    """
    if not text:
        return ""

    text = text.replace('\\n', '\n')  # Normalize newlines

    # Remove all HTML tags (but preserve Markdown formatting)
    text = re.sub(r'<[^>]*>', '', text)

    # Fix spacing issues between numbers and words. The gap is restricted to
    # same-line whitespace ([^\S\n]) so that a line ending in a digit is not
    # merged with the following line.
    text = re.sub(r'(\d+)[^\S\n]*(B|M|T|billion|million|trillion)', r'\1 \2', text)
    text = re.sub(r'(\d)[^\S\n]*([a-zA-Z])', r'\1 \2', text)
    text = re.sub(r'([a-zA-Z])[^\S\n]*(\d)', r'\1 \2', text)

    # Remove excess asterisks but keep single (*italic*) and double (**bold**)
    # markers intact; only runs of three or more are reduced.
    text = re.sub(r'\*{3,}', '**', text)

    # Split text into paragraphs and clean each paragraph
    cleaned_paragraphs = []
    for paragraph in text.split('\n\n'):
        cleaned_lines = []
        for line in paragraph.split('\n'):
            stripped = line.strip()
            if stripped.startswith('-'):
                # Bullet points are kept verbatim (minus outer whitespace).
                cleaned_line = stripped
            else:
                # NOTE(review): this also removes sentence punctuation
                # (".", ",", ":", "/"), which breaks URLs inside Markdown
                # links -- kept as-is because callers may rely on it.
                cleaned_line = re.sub(r'[^\w\s\*_\[\]\(\)-]', '', line)
                cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', cleaned_line)
            cleaned_lines.append(cleaned_line)

        # Join lines back into paragraphs
        cleaned_paragraphs.append('\n'.join(cleaned_lines))

    # Join all paragraphs together
    cleaned_text = '\n\n'.join(cleaned_paragraphs)
    return cleaned_text
398
 
399
 
@@ -1135,12 +1156,11 @@ Before submitting any content, ensure it includes:
1135
 
1136
  #### Report/Article/Write-up/ Blog
1137
  - **Introduction**: "Here is a draft of your [Annual Report/Article/Write-up]. Feel free to suggest further refinements."
1138
- - **Content**:
1139
  - Give headlines conversational headings to structure content . Do not include source links within the content.
1140
  - Write from the perspective of being part of the organization, using "we".
1141
  - Maintain an active, engaging, and direct tone.
1142
- **Donot give source link in content**
1143
-
1144
  #### Social Media Posts
1145
  - **Introduction Line**: "Here is a draft of your social media post. Feel free to suggest further refinements."
1146
  - **Content**:
@@ -1817,7 +1837,7 @@ def handle_document_query(query):
1817
  # Generate AI response with document context
1818
  full_prompt = f"Document Content:\n{doc_content}\n\nUser Query: {query}\n\nResponse:"
1819
  try:
1820
- llm = ChatOpenAI(model="gpt-4", temperature=0.5, api_key=openai_api_key)
1821
  response = llm.invoke(full_prompt)
1822
  return response.content
1823
  except Exception as e:
@@ -2008,7 +2028,7 @@ def handle_prompt(prompt):
2008
  " Use the following structure:"
2009
  " -Heuristics: examples (e.g., social proof, authority, commitment)."
2010
  " -Creative Techniques: examples (list only relevant marketing techniques without additional details)."
2011
- "The final output must not include AI jargon or marketing buzzwords (donot provide source in content) as instruicted and Give well title and sub-headlines. Strictly interconnected sections having Flowing narrative and audience engagement at its peak to create an impactful and memorable experience.Avoid mentioning trustbucket names."
2012
  )
2013
  else:
2014
  appended_instructions = ""
 
356
 
def clean_text(text):
    """
    Clean text to remove broken formatting, fix spacing issues,
    handle bullet points, and encode URLs in Markdown links.

    Steps, in order:
      1. Turn literal backslash-n sequences into real newlines.
      2. Strip anything that looks like an HTML tag.
      3. Percent-encode the URL of every ``[label](http...)`` link and park
         it behind a placeholder so later steps cannot alter it.
      4. Insert a single space between a number and an adjacent word/unit
         (and between a word and an adjacent number).
      5. Per line: lines fully wrapped in ``**`` are kept as bold headings;
         all other lines lose stray asterisks / minus-like symbols and have
         camelCase boundaries split. Whitespace runs collapse to one space.
      6. Drop empty paragraphs and put the encoded URLs back.

    Args:
        text: The raw text to clean. ``None`` or an empty string yields "".

    Returns:
        The cleaned text, with paragraphs separated by blank lines.
    """
    # Imported locally so the function does not depend on a file-level import.
    from urllib.parse import quote

    if not text:
        return ""

    # Private-use code points delimit URL placeholders; they are neither
    # letters, digits nor whitespace, so none of the regexes below touch them.
    open_mark, close_mark = "\ue000", "\ue001"
    stashed_urls = []

    def stash_link(match):
        """Encode the link's URL, store it, and return the link with a placeholder."""
        # RFC 3986 reserved characters stay literal; "%" is kept too so that
        # already-encoded sequences such as %20 are not encoded a second time.
        stashed_urls.append(quote(match.group(2), safe=":/?#[]@!$&'()*+,;=%"))
        token = open_mark + str(len(stashed_urls) - 1) + close_mark
        return "[" + match.group(1) + "](" + token + ")"

    def restore_link(match):
        """Swap a placeholder back for its encoded URL."""
        index = int(match.group(1))
        # A placeholder-looking sequence that was already in the input has no
        # stored URL; leave it untouched rather than raising.
        return stashed_urls[index] if index < len(stashed_urls) else match.group(0)

    text = text.replace('\\n', '\n')  # Normalize newlines

    # Remove all HTML tags
    text = re.sub(r'<[^>]*>', '', text)

    # Encode all URLs in Markdown links. The URL part stops at the first ")"
    # so that two links separated only by punctuation are matched one by one.
    text = re.sub(r'\[([^\]]+)\]\((https?://[^\s)]+)\)', stash_link, text)

    # Fix spacing issues between numbers and words. The gap is restricted to
    # same-line whitespace ([^\S\n]) so that a line ending in a digit is not
    # merged with the following line.
    text = re.sub(r'(\d+)[^\S\n]*(B|M|T|billion|million|trillion)', r'\1 \2', text)
    text = re.sub(r'(\d)[^\S\n]*([a-zA-Z])', r'\1 \2', text)
    text = re.sub(r'([a-zA-Z])[^\S\n]*(\d)', r'\1 \2', text)

    # Split the text into paragraphs
    cleaned_paragraphs = []
    for paragraph in text.split('\n\n'):
        cleaned_lines = []
        for line in paragraph.split('\n'):
            stripped = line.strip()
            # Preserve bold formatting for headings
            if stripped.startswith('**') and stripped.endswith('**'):
                cleaned_line = stripped
            else:
                # Remove asterisks and look-alike symbols (U+2212 MINUS SIGN,
                # U+2217 ASTERISK OPERATOR).
                # NOTE(review): the third alternative of the original pattern
                # was unreadable; U+2217 is assumed -- confirm.
                cleaned_line = re.sub('[*\u2212\u2217]', '', line)
                cleaned_line = re.sub(r'([a-z])([A-Z])', r'\1 \2', cleaned_line)

            # Bullet lines need no special casing: every line is already
            # joined with "\n" below, so each bullet stays on its own line.

            # Remove extra spaces
            cleaned_lines.append(re.sub(r'\s+', ' ', cleaned_line).strip())

        # Join the lines within each paragraph
        cleaned_paragraphs.append('\n'.join(cleaned_lines))

    # Join the paragraphs back together, skipping empty ones
    cleaned_text = '\n\n'.join(para for para in cleaned_paragraphs if para)

    # Put the encoded URLs back in place of their placeholders
    return re.sub(open_mark + r'(\d+)' + close_mark, restore_link, cleaned_text)
419
 
420
 
 
1156
 
1157
  #### Report/Article/Write-up/ Blog
1158
  - **Introduction**: "Here is a draft of your [Annual Report/Article/Write-up]. Feel free to suggest further refinements."
1159
+ - **Content**: **Donot give source link in content**
1160
  - Give headlines conversational headings to structure content . Do not include source links within the content.
1161
  - Write from the perspective of being part of the organization, using "we".
1162
  - Maintain an active, engaging, and direct tone.
1163
+
 
1164
  #### Social Media Posts
1165
  - **Introduction Line**: "Here is a draft of your social media post. Feel free to suggest further refinements."
1166
  - **Content**:
 
1837
  # Generate AI response with document context
1838
  full_prompt = f"Document Content:\n{doc_content}\n\nUser Query: {query}\n\nResponse:"
1839
  try:
1840
+ llm = ChatOpenAI(model="gpt-4o", temperature=0.5, api_key=openai_api_key)
1841
  response = llm.invoke(full_prompt)
1842
  return response.content
1843
  except Exception as e:
 
2028
  " Use the following structure:"
2029
  " -Heuristics: examples (e.g., social proof, authority, commitment)."
2030
  " -Creative Techniques: examples (list only relevant marketing techniques without additional details)."
2031
+ "The final output must not include AI jargon or marketing buzzwords and Give well title and 2-3 sub-headlines. Strictly interconnected sections having Flowing narrative and audience engagement at its peak to create an impactful and memorable experience.Avoid mentioning trustbucket names."
2032
  )
2033
  else:
2034
  appended_instructions = ""