Wajahat698 committed on
Commit
fc8ff97
·
verified ·
1 Parent(s): 4dabaf1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -114
app.py CHANGED
@@ -52,8 +52,7 @@ st.set_page_config(layout="wide")
52
  import logging
53
  import asyncio
54
  import re
55
- from docx import Document as DocxDocument
56
-
57
 
58
  # Set up logging to suppress Streamlit warnings about experimental functions
59
  logging.getLogger('streamlit').setLevel(logging.ERROR)
@@ -111,7 +110,7 @@ def convert_docx_to_md(file):
111
  """
112
  try:
113
  # Read the file
114
- doc = DocxDocument(uploaded_file)
115
  # Extract all text
116
  text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
117
  if not text.strip(): # Handle empty content
@@ -159,36 +158,36 @@ def merge_markdown_contents(contents):
159
  return merged_content
160
 
161
  def upload_to_firebase(user_id, file):
162
- content = convert_file_to_md(file)
 
 
 
163
  if not content:
164
- return None, "Failed to convert file to content."
165
 
166
  doc_id = str(uuid.uuid4())
167
  document_data = {"content": content, "name": file.name}
168
 
169
- # Save to Firebase
170
  db.child("users").child(user_id).child("KnowledgeBase").child(doc_id).set(document_data)
171
 
172
- # Update session state
173
- if "documents" not in st.session_state:
174
- st.session_state["documents"] = {}
175
- st.session_state["documents"][doc_id] = document_data
176
 
177
- # Index the document content
178
  index_document_content(content, doc_id)
179
 
180
- st.sidebar.success(f"Document '{file.name}' uploaded successfully!")
181
- return content,None
 
182
 
183
  def index_document_content(doc_content, doc_id):
184
  """
185
  Indexes the document content by splitting it into chunks and creating embeddings.
186
  """
187
- # Split the document into chunks
188
- text_splitter = RecursiveCharacterTextSplitter(
189
- chunk_size=500,
190
- chunk_overlap=50,
191
- )
192
  texts = text_splitter.split_text(doc_content)
193
 
194
  # Create embeddings for each chunk
@@ -202,6 +201,8 @@ def index_document_content(doc_content, doc_id):
202
  st.session_state["vector_store"][doc_id] = vector_store
203
 
204
 
 
 
205
  def fetch_trustbuilders(user_id):
206
  """
207
  Retrieve TrustBuilders from Firebase for a specific user.
@@ -1040,7 +1041,7 @@ def google_search(query):
1040
  "q": query,
1041
  "sort": "date", # Sort results by date for freshness
1042
  "hl": "en", # Language: English
1043
- "gl": "us", # Geolocation: United States
1044
  }
1045
 
1046
  # Perform the search
@@ -1066,30 +1067,31 @@ def google_search(query):
1066
 
1067
  # RAG response function
1068
  def rag_response(query):
 
 
 
1069
  try:
1070
- # Check if the query references uploaded documents
1071
- if "using uploaded document" in query.lower():
1072
- document_response = handle_document_query(query) # Use your existing `handle_document_query` function
1073
- if document_response:
1074
- return document_response
1075
- else:
1076
-
1077
- # Proceed with the existing knowledge base logic if no uploaded document context is specified
1078
- retrieved_docs = search_knowledge_base(query)
1079
- context = "\n".join(doc.page_content for doc in retrieved_docs)
1080
-
1081
- # Prepare the prompt with the retrieved context
1082
- prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
1083
- llm = ChatOpenAI(model="gpt-4o", temperature=0.3, api_key=openai_api_key)
1084
- response = llm.invoke(prompt)
1085
-
1086
- # Replace terms in the final output as per your restrictions
1087
- response_content = response.content
1088
-
1089
- return response_content
1090
  except Exception as e:
1091
  logger.error(f"Error generating RAG response: {e}")
1092
- return "Error occurred during RAG response generation"
 
1093
 
1094
  # Define tools
1095
  @tool
@@ -1107,7 +1109,7 @@ tools = [knowledge_base_tool, google_search_tool]
1107
  prompt_message = f"""
1108
  ** You are a Professional copywriter tasked with creating non-flowery fluid, interconnected marketing content that integrates Trust Builders into various formats for any organization. Your content should be compelling, factual, well-structured, concise, and based on the knowledgebase. Write in an active voice using the first-person perspective ("we"), and avoid the third-person perspective. Creatively interconnect trust-building elements to enhance flow and impact. Avoid using terms like Stability, Development, Competence, Relationship, Benefit, Vision, trust, beacon, beacon of hope, and realm, except where specified.
1109
 
1110
- ### Mandatory Verification Checklist
1111
  Before submitting any content, ensure it includes:
1112
  - **Specific Details**:
1113
  - At least **three specific dollar amounts** with exact figures (e.g., "$127.5 million").
@@ -1121,16 +1123,16 @@ Before submitting any content, ensure it includes:
1121
  - **Each point must be followed by**:
1122
  - "This [specific benefit] for [specific audience]"
1123
  - **Example**: "This reduces wait times by 47% for patients seeking emergency care."
 
 
 
 
 
 
 
 
 
1124
 
1125
- ### Sources and Specificity
1126
- - Include **current and valid source links** next to each trust building point.
1127
- - Replace vague phrases with specific details:
1128
- - "many" → exact number.
1129
- - "millions" → "$127.5 million".
1130
- - "recently" → "March 15, 2023".
1131
- - "global presence" → "offices in 127 cities across 45 countries".
1132
- - "industry leader" → "ranked #1 in customer satisfaction by J.D. Power in 2023".
1133
- - "significant impact" → "47% reduction in processing time".
1134
 
1135
  ### Critical Mandatory Instructions
1136
  - **Avoid Prohibited Terms**: Do not mention "trust," "trust buckets," or category names like Development, Stability, Competence, Relationship, Vision in the copy, except for headings and searches.
@@ -1185,8 +1187,8 @@ Before submitting any content, ensure it includes:
1185
  -Creative Techniques: examples (list only relevant marketing techniques without additional details).
1186
  -Limit to 3-5 items in each category.
1187
 
1188
- ### 5.Trust-Based Queries:**
1189
- Be over specific with numbers,names,dollars, programs ,awards and action.
1190
  - When a query seeks a specific number of trust builders (e.g., "5 trust builders"), the AI should:
1191
  - Randomly pick the requested number of trust buckets from the six available: Development Trust, Competence Trust, Stability Trust, Relationship Trust, Benefit Trust, and Vision Trust.
1192
  - For each selected bucket, find 15 TrustBuilders® points be over specific with numbers,names,dollars, programs ,awards and action.
@@ -1249,6 +1251,9 @@ Before submitting any content, ensure it includes:
1249
  **Organization**
1250
  - In **2023**, World Vision invested **$150 million** in sustainable agriculture programs across **35 countries**, impacting over **2 million** farmers.This improves food security for vulnerable communities.- [Source](#)der each main category, list the trust-building points directly as bullet points or numbered lists **without any additional subheadings, labels, descriptors, phrases, or words before the points**.
1251
 
 
 
 
1252
  ### General Queries
1253
  - Do not use the knowledge base for non-trust content.
1254
  - Always clarify the audience impact and ensure all information is based on verified sources.
@@ -1784,80 +1789,31 @@ def load_user_memory(user_id):
1784
  st.session_state["documents"] = {}
1785
  st.session_state["vector_store"] = {}
1786
 
1787
- def get_document_content(doc_name=None):
1788
- documents = st.session_state.get("documents", {})
1789
- if not documents:
1790
- return None, "No documents have been uploaded."
1791
-
1792
- # If a specific document name is provided
1793
- if doc_name:
1794
- for doc_id, doc_data in documents.items():
1795
- if doc_data.get("name", "").lower() == doc_name.lower():
1796
- content = doc_data.get("content")
1797
- if content:
1798
- return content, None
1799
- else:
1800
- return None, f"Document '{doc_name}' does not contain any content."
1801
- return None, f"Document '{doc_name}' not found."
1802
-
1803
- # Default to the most recent document
1804
- last_doc = list(documents.values())[-1]
1805
- content = last_doc.get("content")
1806
- if content:
1807
- return content, None
1808
- else:
1809
- return None, "The most recently uploaded document does not contain any content."
1810
-
1811
- def handle_document_query(query):
1812
- """
1813
- Handle queries related to uploaded documents for response generation.
1814
- """
1815
- # Extract specific document name if mentioned
1816
- doc_name_match = re.search(r"document\s+'([^']+)'", query, re.IGNORECASE)
1817
- doc_name = doc_name_match.group(1) if doc_name_match else None
1818
-
1819
- # Fetch document content
1820
- doc_content, error = get_document_content(doc_name)
1821
- if error:
1822
- return error
1823
 
1824
- # Generate AI response with document context
1825
- full_prompt = f"Document Content:\n{doc_content}\n\nUser Query: {query}\n\nResponse:"
1826
- try:
1827
- llm = ChatOpenAI(model="gpt-4o", temperature=0.5, api_key=openai_api_key)
1828
- response = llm.invoke(full_prompt)
1829
- return response.content
1830
- except Exception as e:
1831
- return f"Error generating response using the document: {e}"
1832
 
1833
 
1834
-
1835
  def clean_and_format_markdown(raw_text):
1836
  """
1837
- Cleans up formatting issues in dynamically generated text.
1838
- Fixes missing spaces, ensures proper sentence structure, and formats Markdown.
1839
  """
1840
- # Fix missing spaces between words (e.g., "430billiontotheU.S.economy")
1841
- text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text) # Add space between lowercase and uppercase
1842
- text = re.sub(r'(\d)([A-Za-z])', r'\1 \2', text) # Add space between numbers and letters
1843
- text = re.sub(r'([A-Za-z])(\d)', r'\1 \2', text) # Add space between letters and numbers
1844
 
1845
- # Ensure proper Markdown URL formatting
1846
- link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
1847
  def encode_url(match):
1848
  text = match.group(1)
1849
- url = match.group(2).strip()
1850
- encoded_url = quote(url, safe=':/')
1851
  return f"[{text}]({encoded_url})"
1852
- text = re.sub(link_pattern, encode_url, text)
1853
 
1854
- # Ensure proper sentence spacing and line breaks
1855
- text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text) # Replace single newlines with spaces
1856
- text = re.sub(r'\s+', ' ', text) # Replace multiple spaces with a single space
1857
 
1858
- return text
1859
-
1860
 
 
 
1861
 
1862
  if "missing_trustbucket_content" not in st.session_state:
1863
  st.session_state["missing_trustbucket_content"] = None
@@ -2004,7 +1960,7 @@ def handle_prompt(prompt):
2004
  cleaned_text = ""
2005
  base_instructions = (
2006
  "Avoid flowery language, typical AI phrases, or jargon. "
2007
- "Sources must be the latest, valid, and verifiable ."
2008
  "Strictly dont use trustbucket names in copy headings and content avoid it"
2009
  )
2010
 
 
52
  import logging
53
  import asyncio
54
  import re
55
+ import docx
 
56
 
57
  # Set up logging to suppress Streamlit warnings about experimental functions
58
  logging.getLogger('streamlit').setLevel(logging.ERROR)
 
110
  """
111
  try:
112
  # Read the file
113
+ doc = docx.Document(file)
114
  # Extract all text
115
  text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
116
  if not text.strip(): # Handle empty content
 
158
  return merged_content
159
 
160
  def upload_to_firebase(user_id, file):
161
+ """
162
+ Upload document to Firebase, extract content, and add it to the knowledge base.
163
+ """
164
+ content = convert_file_to_md(file) # Ensure this function extracts content correctly
165
  if not content:
166
+ return None, "Failed to extract content from the file."
167
 
168
  doc_id = str(uuid.uuid4())
169
  document_data = {"content": content, "name": file.name}
170
 
171
+ # Save document to Firebase
172
  db.child("users").child(user_id).child("KnowledgeBase").child(doc_id).set(document_data)
173
 
174
+ # Add content to the knowledge base
175
+ if "knowledge_base" not in st.session_state:
176
+ st.session_state["knowledge_base"] = []
177
+ st.session_state["knowledge_base"].append({"doc_id": doc_id, "content": content})
178
 
179
+ # Index the document content for semantic search
180
  index_document_content(content, doc_id)
181
 
182
+ st.sidebar.success(f"Document '{file.name}' uploaded successfully and added to the knowledge base!")
183
+ return content, None
184
+
185
 
186
  def index_document_content(doc_content, doc_id):
187
  """
188
  Indexes the document content by splitting it into chunks and creating embeddings.
189
  """
190
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
 
 
 
 
191
  texts = text_splitter.split_text(doc_content)
192
 
193
  # Create embeddings for each chunk
 
201
  st.session_state["vector_store"][doc_id] = vector_store
202
 
203
 
204
+
205
+
206
  def fetch_trustbuilders(user_id):
207
  """
208
  Retrieve TrustBuilders from Firebase for a specific user.
 
1041
  "q": query,
1042
  "sort": "date", # Sort results by date for freshness
1043
  "hl": "en", # Language: English
1044
+ "gl": "uk", # Geolocation: United Kingdom
1045
  }
1046
 
1047
  # Perform the search
 
1067
 
1068
  # RAG response function
1069
  def rag_response(query):
1070
+ """
1071
+ Handle queries by searching both static and dynamically uploaded knowledge base.
1072
+ """
1073
  try:
1074
+ # Retrieve relevant chunks from the vector store
1075
+ results = []
1076
+ if "vector_store" in st.session_state:
1077
+ for vector_store in st.session_state["vector_store"].values():
1078
+ results.extend(vector_store.similarity_search(query, k=3)) # Adjust `k` for the number of results
1079
+
1080
+ # Combine results into a context
1081
+ context = "\n".join([result.page_content for result in results])
1082
+ if not context:
1083
+ return "No relevant information found in the knowledge base."
1084
+
1085
+ # Generate AI response with the retrieved context
1086
+ prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
1087
+ llm = ChatOpenAI(model="gpt-4", temperature=0.3, api_key=openai_api_key)
1088
+ response = llm.invoke(prompt)
1089
+
1090
+ return response.content
 
 
 
1091
  except Exception as e:
1092
  logger.error(f"Error generating RAG response: {e}")
1093
+ return "An error occurred during the RAG response generation process."
1094
+
1095
 
1096
  # Define tools
1097
  @tool
 
1109
  prompt_message = f"""
1110
  ** You are a Professional copywriter tasked with creating non-flowery fluid, interconnected marketing content that integrates Trust Builders into various formats for any organization. Your content should be compelling, factual, well-structured, concise, and based on the knowledgebase. Write in an active voice using the first-person perspective ("we"), and avoid the third-person perspective. Creatively interconnect trust-building elements to enhance flow and impact. Avoid using terms like Stability, Development, Competence, Relationship, Benefit, Vision, trust, beacon, beacon of hope, and realm, except where specified.
1111
 
1112
+ ### Mandatory Verification Checklist
1113
  Before submitting any content, ensure it includes:
1114
  - **Specific Details**:
1115
  - At least **three specific dollar amounts** with exact figures (e.g., "$127.5 million").
 
1123
  - **Each point must be followed by**:
1124
  - "This [specific benefit] for [specific audience]"
1125
  - **Example**: "This reduces wait times by 47% for patients seeking emergency care."
1126
+
1127
+ ## Sources and Specificity
1128
+ Replace vague phrases with specific details:
1129
+ - ❌ "many" → ✅ exact number.
1130
+ - ❌ "millions" → ✅ "$127.5 million".
1131
+ - ❌ "recently" → ✅ "March 15, 2023".
1132
+ - ❌ "global presence" → ✅ "offices in 127 cities across 45 countries".
1133
+ - ❌ "industry leader" → ✅ "ranked #1 in customer satisfaction by J.D. Power in 2023".
1134
+ - ❌ "significant impact" → ✅ "47% reduction in processing time".
1135
 
 
 
 
 
 
 
 
 
 
1136
 
1137
  ### Critical Mandatory Instructions
1138
  - **Avoid Prohibited Terms**: Do not mention "trust," "trust buckets," or category names like Development, Stability, Competence, Relationship, Vision in the copy, except for headings and searches.
 
1187
  -Creative Techniques: examples (list only relevant marketing techniques without additional details).
1188
  -Limit to 3-5 items in each category.
1189
 
1190
+ ### 5.Trust-Based Queries:**
1191
+ ###Be over specific with numbers,names,dollars, programs ,awards and action.
1192
  - When a query seeks a specific number of trust builders (e.g., "5 trust builders"), the AI should:
1193
  - Randomly pick the requested number of trust buckets from the six available: Development Trust, Competence Trust, Stability Trust, Relationship Trust, Benefit Trust, and Vision Trust.
1194
  - For each selected bucket, find 15 TrustBuilders® points be over specific with numbers,names,dollars, programs ,awards and action.
 
1251
  **Organization**
1252
  - In **2023**, World Vision invested **$150 million** in sustainable agriculture programs across **35 countries**, impacting over **2 million** farmers.This improves food security for vulnerable communities.- [Source](#)der each main category, list the trust-building points directly as bullet points or numbered lists **without any additional subheadings, labels, descriptors, phrases, or words before the points**.
1253
 
1254
+ -- **Audience Relevance**:
1255
+ - Each point must be followed by a benefit for a specific audience (e.g., "This reduces wait times by 47% for patients seeking emergency care").
1256
+
1257
  ### General Queries
1258
  - Do not use the knowledge base for non-trust content.
1259
  - Always clarify the audience impact and ensure all information is based on verified sources.
 
1789
  st.session_state["documents"] = {}
1790
  st.session_state["vector_store"] = {}
1791
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1792
 
 
 
 
 
 
 
 
 
1793
 
1794
 
 
1795
  def clean_and_format_markdown(raw_text):
1796
  """
1797
+ Dynamically cleans and formats Markdown text to ensure URLs are properly encoded
1798
+ and handles issues with line breaks or improperly formatted Markdown.
1799
  """
1800
+ # Regular expression to find Markdown links [text](url)
1801
+ pattern = r'\[([^\]]+)\]\(([^)]+)\)'
 
 
1802
 
 
 
1803
  def encode_url(match):
1804
  text = match.group(1)
1805
+ url = match.group(2).strip() # Remove leading/trailing spaces
1806
+ encoded_url = quote(url, safe=':/') # Encode the URL while keeping : and /
1807
  return f"[{text}]({encoded_url})"
 
1808
 
1809
+ # Fix Markdown links dynamically
1810
+ formatted_text = re.sub(pattern, encode_url, raw_text)
 
1811
 
1812
+ # Replace single newlines with spaces to avoid breaking Markdown rendering
1813
+ formatted_text = re.sub(r"(?<!\n)\n(?!\n)", " ", formatted_text)
1814
 
1815
+ return formatted_text
1816
+
1817
 
1818
  if "missing_trustbucket_content" not in st.session_state:
1819
  st.session_state["missing_trustbucket_content"] = None
 
1960
  cleaned_text = ""
1961
  base_instructions = (
1962
  "Avoid flowery language, typical AI phrases, or jargon. "
1963
+ "Sources must be the latest and valid. "
1964
  "Strictly dont use trustbucket names in copy headings and content avoid it"
1965
  )
1966