Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -52,8 +52,7 @@ st.set_page_config(layout="wide")
|
|
| 52 |
import logging
|
| 53 |
import asyncio
|
| 54 |
import re
|
| 55 |
-
|
| 56 |
-
|
| 57 |
|
| 58 |
# Set up logging to suppress Streamlit warnings about experimental functions
|
| 59 |
logging.getLogger('streamlit').setLevel(logging.ERROR)
|
|
@@ -111,7 +110,7 @@ def convert_docx_to_md(file):
|
|
| 111 |
"""
|
| 112 |
try:
|
| 113 |
# Read the file
|
| 114 |
-
doc =
|
| 115 |
# Extract all text
|
| 116 |
text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
|
| 117 |
if not text.strip(): # Handle empty content
|
|
@@ -159,36 +158,36 @@ def merge_markdown_contents(contents):
|
|
| 159 |
return merged_content
|
| 160 |
|
| 161 |
def upload_to_firebase(user_id, file):
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
| 163 |
if not content:
|
| 164 |
-
return None, "Failed to
|
| 165 |
|
| 166 |
doc_id = str(uuid.uuid4())
|
| 167 |
document_data = {"content": content, "name": file.name}
|
| 168 |
|
| 169 |
-
# Save to Firebase
|
| 170 |
db.child("users").child(user_id).child("KnowledgeBase").child(doc_id).set(document_data)
|
| 171 |
|
| 172 |
-
#
|
| 173 |
-
if "
|
| 174 |
-
st.session_state["
|
| 175 |
-
st.session_state["
|
| 176 |
|
| 177 |
-
# Index the document content
|
| 178 |
index_document_content(content, doc_id)
|
| 179 |
|
| 180 |
-
st.sidebar.success(f"Document '{file.name}' uploaded
|
| 181 |
-
return content,None
|
|
|
|
| 182 |
|
| 183 |
def index_document_content(doc_content, doc_id):
|
| 184 |
"""
|
| 185 |
Indexes the document content by splitting it into chunks and creating embeddings.
|
| 186 |
"""
|
| 187 |
-
|
| 188 |
-
text_splitter = RecursiveCharacterTextSplitter(
|
| 189 |
-
chunk_size=500,
|
| 190 |
-
chunk_overlap=50,
|
| 191 |
-
)
|
| 192 |
texts = text_splitter.split_text(doc_content)
|
| 193 |
|
| 194 |
# Create embeddings for each chunk
|
|
@@ -202,6 +201,8 @@ def index_document_content(doc_content, doc_id):
|
|
| 202 |
st.session_state["vector_store"][doc_id] = vector_store
|
| 203 |
|
| 204 |
|
|
|
|
|
|
|
| 205 |
def fetch_trustbuilders(user_id):
|
| 206 |
"""
|
| 207 |
Retrieve TrustBuilders from Firebase for a specific user.
|
|
@@ -1040,7 +1041,7 @@ def google_search(query):
|
|
| 1040 |
"q": query,
|
| 1041 |
"sort": "date", # Sort results by date for freshness
|
| 1042 |
"hl": "en", # Language: English
|
| 1043 |
-
"gl": "
|
| 1044 |
}
|
| 1045 |
|
| 1046 |
# Perform the search
|
|
@@ -1066,30 +1067,31 @@ def google_search(query):
|
|
| 1066 |
|
| 1067 |
# RAG response function
|
| 1068 |
def rag_response(query):
|
|
|
|
|
|
|
|
|
|
| 1069 |
try:
|
| 1070 |
-
#
|
| 1071 |
-
|
| 1072 |
-
|
| 1073 |
-
|
| 1074 |
-
|
| 1075 |
-
|
| 1076 |
-
|
| 1077 |
-
|
| 1078 |
-
|
| 1079 |
-
|
| 1080 |
-
|
| 1081 |
-
|
| 1082 |
-
|
| 1083 |
-
|
| 1084 |
-
|
| 1085 |
-
|
| 1086 |
-
|
| 1087 |
-
response_content = response.content
|
| 1088 |
-
|
| 1089 |
-
return response_content
|
| 1090 |
except Exception as e:
|
| 1091 |
logger.error(f"Error generating RAG response: {e}")
|
| 1092 |
-
return "
|
|
|
|
| 1093 |
|
| 1094 |
# Define tools
|
| 1095 |
@tool
|
|
@@ -1107,7 +1109,7 @@ tools = [knowledge_base_tool, google_search_tool]
|
|
| 1107 |
prompt_message = f"""
|
| 1108 |
** You are a Professional copywriter tasked with creating non-flowery fluid, interconnected marketing content that integrates Trust Builders into various formats for any organization. Your content should be compelling, factual, well-structured, concise, and based on the knowledgebase. Write in an active voice using the first-person perspective ("we"), and avoid the third-person perspective. Creatively interconnect trust-building elements to enhance flow and impact. Avoid using terms like Stability, Development, Competence, Relationship, Benefit, Vision, trust, beacon, beacon of hope, and realm, except where specified.
|
| 1109 |
|
| 1110 |
-
###
|
| 1111 |
Before submitting any content, ensure it includes:
|
| 1112 |
- **Specific Details**:
|
| 1113 |
- At least **three specific dollar amounts** with exact figures (e.g., "$127.5 million").
|
|
@@ -1121,16 +1123,16 @@ Before submitting any content, ensure it includes:
|
|
| 1121 |
- **Each point must be followed by**:
|
| 1122 |
- "This [specific benefit] for [specific audience]"
|
| 1123 |
- **Example**: "This reduces wait times by 47% for patients seeking emergency care."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1124 |
|
| 1125 |
-
### Sources and Specificity
|
| 1126 |
-
- Include **current and valid source links** next to each trust building point.
|
| 1127 |
-
- Replace vague phrases with specific details:
|
| 1128 |
-
- "many" → exact number.
|
| 1129 |
-
- "millions" → "$127.5 million".
|
| 1130 |
-
- "recently" → "March 15, 2023".
|
| 1131 |
-
- "global presence" → "offices in 127 cities across 45 countries".
|
| 1132 |
-
- "industry leader" → "ranked #1 in customer satisfaction by J.D. Power in 2023".
|
| 1133 |
-
- "significant impact" → "47% reduction in processing time".
|
| 1134 |
|
| 1135 |
### Critical Mandatory Instructions
|
| 1136 |
- **Avoid Prohibited Terms**: Do not mention "trust," "trust buckets," or category names like Development, Stability, Competence, Relationship, Vision in the copy, except for headings and searches.
|
|
@@ -1185,8 +1187,8 @@ Before submitting any content, ensure it includes:
|
|
| 1185 |
-Creative Techniques: examples (list only relevant marketing techniques without additional details).
|
| 1186 |
-Limit to 3-5 items in each category.
|
| 1187 |
|
| 1188 |
-
### 5.Trust-Based Queries:**
|
| 1189 |
-
|
| 1190 |
- When a query seeks a specific number of trust builders (e.g., "5 trust builders"), the AI should:
|
| 1191 |
- Randomly pick the requested number of trust buckets from the six available: Development Trust, Competence Trust, Stability Trust, Relationship Trust, Benefit Trust, and Vision Trust.
|
| 1192 |
- For each selected bucket, find 15 TrustBuilders® points be over specific with numbers,names,dollars, programs ,awards and action.
|
|
@@ -1249,6 +1251,9 @@ Before submitting any content, ensure it includes:
|
|
| 1249 |
**Organization**
|
| 1250 |
- In **2023**, World Vision invested **$150 million** in sustainable agriculture programs across **35 countries**, impacting over **2 million** farmers.This improves food security for vulnerable communities.- [Source](#)der each main category, list the trust-building points directly as bullet points or numbered lists **without any additional subheadings, labels, descriptors, phrases, or words before the points**.
|
| 1251 |
|
|
|
|
|
|
|
|
|
|
| 1252 |
### General Queries
|
| 1253 |
- Do not use the knowledge base for non-trust content.
|
| 1254 |
- Always clarify the audience impact and ensure all information is based on verified sources.
|
|
@@ -1784,80 +1789,31 @@ def load_user_memory(user_id):
|
|
| 1784 |
st.session_state["documents"] = {}
|
| 1785 |
st.session_state["vector_store"] = {}
|
| 1786 |
|
| 1787 |
-
def get_document_content(doc_name=None):
    """
    Fetch uploaded document content from session state.

    If doc_name is given, look the document up by (case-insensitive) name;
    otherwise fall back to the most recently uploaded document.
    Returns (content, None) on success or (None, error_message) on failure.
    """
    documents = st.session_state.get("documents", {})
    if not documents:
        return None, "No documents have been uploaded."

    # A named lookup takes priority over the "latest document" default.
    if doc_name:
        wanted = doc_name.lower()
        for doc_data in documents.values():
            if doc_data.get("name", "").lower() == wanted:
                body = doc_data.get("content")
                if body:
                    return body, None
                return None, f"Document '{doc_name}' does not contain any content."
        return None, f"Document '{doc_name}' not found."

    # No name given: use the most recent upload (dict preserves insertion order).
    latest = list(documents.values())[-1]
    body = latest.get("content")
    if body:
        return body, None
    return None, "The most recently uploaded document does not contain any content."
|
| 1810 |
-
|
| 1811 |
-
def handle_document_query(query):
    """
    Handle queries related to uploaded documents for response generation.
    """
    # Pull an explicitly named document out of queries like: document 'report.docx'
    name_match = re.search(r"document\s+'([^']+)'", query, re.IGNORECASE)
    target_name = name_match.group(1) if name_match else None

    doc_content, error = get_document_content(target_name)
    if error:
        return error

    # Feed the document in as context ahead of the user's question.
    full_prompt = f"Document Content:\n{doc_content}\n\nUser Query: {query}\n\nResponse:"
    try:
        llm = ChatOpenAI(model="gpt-4o", temperature=0.5, api_key=openai_api_key)
        return llm.invoke(full_prompt).content
    except Exception as e:
        return f"Error generating response using the document: {e}"
|
| 1832 |
|
| 1833 |
|
| 1834 |
-
|
| 1835 |
def clean_and_format_markdown(raw_text):
|
| 1836 |
"""
|
| 1837 |
-
|
| 1838 |
-
|
| 1839 |
"""
|
| 1840 |
-
#
|
| 1841 |
-
|
| 1842 |
-
text = re.sub(r'(\d)([A-Za-z])', r'\1 \2', text) # Add space between numbers and letters
|
| 1843 |
-
text = re.sub(r'([A-Za-z])(\d)', r'\1 \2', text) # Add space between letters and numbers
|
| 1844 |
|
| 1845 |
-
# Ensure proper Markdown URL formatting
|
| 1846 |
-
link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
|
| 1847 |
def encode_url(match):
|
| 1848 |
text = match.group(1)
|
| 1849 |
-
url = match.group(2).strip()
|
| 1850 |
-
encoded_url = quote(url, safe=':/')
|
| 1851 |
return f"[{text}]({encoded_url})"
|
| 1852 |
-
text = re.sub(link_pattern, encode_url, text)
|
| 1853 |
|
| 1854 |
-
#
|
| 1855 |
-
|
| 1856 |
-
text = re.sub(r'\s+', ' ', text) # Replace multiple spaces with a single space
|
| 1857 |
|
| 1858 |
-
|
| 1859 |
-
|
| 1860 |
|
|
|
|
|
|
|
| 1861 |
|
| 1862 |
if "missing_trustbucket_content" not in st.session_state:
|
| 1863 |
st.session_state["missing_trustbucket_content"] = None
|
|
@@ -2004,7 +1960,7 @@ def handle_prompt(prompt):
|
|
| 2004 |
cleaned_text = ""
|
| 2005 |
base_instructions = (
|
| 2006 |
"Avoid flowery language, typical AI phrases, or jargon. "
|
| 2007 |
-
"Sources must be the latest, valid
|
| 2008 |
"Strictly dont use trustbucket names in copy headings and content avoid it"
|
| 2009 |
)
|
| 2010 |
|
|
|
|
| 52 |
import logging
|
| 53 |
import asyncio
|
| 54 |
import re
|
| 55 |
+
import docx
|
|
|
|
| 56 |
|
| 57 |
# Set up logging to suppress Streamlit warnings about experimental functions
|
| 58 |
logging.getLogger('streamlit').setLevel(logging.ERROR)
|
|
|
|
| 110 |
"""
|
| 111 |
try:
|
| 112 |
# Read the file
|
| 113 |
+
doc = docx.Document(file)
|
| 114 |
# Extract all text
|
| 115 |
text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
|
| 116 |
if not text.strip(): # Handle empty content
|
|
|
|
| 158 |
return merged_content
|
| 159 |
|
| 160 |
def upload_to_firebase(user_id, file):
    """
    Upload document to Firebase, extract content, and add it to the knowledge base.

    Returns (content, None) on success or (None, error_message) on failure.
    """
    # Extract Markdown content from the uploaded file first; abort early if empty.
    content = convert_file_to_md(file)  # Ensure this function extracts content correctly
    if not content:
        return None, "Failed to extract content from the file."

    doc_id = str(uuid.uuid4())
    document_data = {"content": content, "name": file.name}

    # Persist the document under this user's KnowledgeBase node.
    db.child("users").child(user_id).child("KnowledgeBase").child(doc_id).set(document_data)

    # Keep an in-session copy so the current run can query it immediately.
    st.session_state.setdefault("knowledge_base", []).append(
        {"doc_id": doc_id, "content": content}
    )

    # Build embeddings so the document is searchable right away.
    index_document_content(content, doc_id)

    st.sidebar.success(f"Document '{file.name}' uploaded successfully and added to the knowledge base!")
    return content, None
|
| 184 |
+
|
| 185 |
|
| 186 |
def index_document_content(doc_content, doc_id):
|
| 187 |
"""
|
| 188 |
Indexes the document content by splitting it into chunks and creating embeddings.
|
| 189 |
"""
|
| 190 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
texts = text_splitter.split_text(doc_content)
|
| 192 |
|
| 193 |
# Create embeddings for each chunk
|
|
|
|
| 201 |
st.session_state["vector_store"][doc_id] = vector_store
|
| 202 |
|
| 203 |
|
| 204 |
+
|
| 205 |
+
|
| 206 |
def fetch_trustbuilders(user_id):
|
| 207 |
"""
|
| 208 |
Retrieve TrustBuilders from Firebase for a specific user.
|
|
|
|
| 1041 |
"q": query,
|
| 1042 |
"sort": "date", # Sort results by date for freshness
|
| 1043 |
"hl": "en", # Language: English
|
| 1044 |
+
"gl": "uk", # Geolocation: United Kingdom
|
| 1045 |
}
|
| 1046 |
|
| 1047 |
# Perform the search
|
|
|
|
| 1067 |
|
| 1068 |
# RAG response function
|
| 1069 |
def rag_response(query):
    """
    Handle queries by searching both static and dynamically uploaded knowledge base.
    """
    try:
        # Gather the top matches from every per-document vector store.
        hits = []
        for store in st.session_state.get("vector_store", {}).values():
            hits.extend(store.similarity_search(query, k=3))  # Adjust `k` for the number of results

        # Combine the retrieved chunks into one context string.
        context = "\n".join(hit.page_content for hit in hits)
        if not context:
            return "No relevant information found in the knowledge base."

        # Ask the model to answer from the retrieved context.
        prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
        llm = ChatOpenAI(model="gpt-4", temperature=0.3, api_key=openai_api_key)
        return llm.invoke(prompt).content
    except Exception as e:
        logger.error(f"Error generating RAG response: {e}")
        return "An error occurred during the RAG response generation process."
|
| 1094 |
+
|
| 1095 |
|
| 1096 |
# Define tools
|
| 1097 |
@tool
|
|
|
|
| 1109 |
prompt_message = f"""
|
| 1110 |
** You are a Professional copywriter tasked with creating non-flowery fluid, interconnected marketing content that integrates Trust Builders into various formats for any organization. Your content should be compelling, factual, well-structured, concise, and based on the knowledgebase. Write in an active voice using the first-person perspective ("we"), and avoid the third-person perspective. Creatively interconnect trust-building elements to enhance flow and impact. Avoid using terms like Stability, Development, Competence, Relationship, Benefit, Vision, trust, beacon, beacon of hope, and realm, except where specified.
|
| 1111 |
|
| 1112 |
+
###Mandatory Verification Checklist
|
| 1113 |
Before submitting any content, ensure it includes:
|
| 1114 |
- **Specific Details**:
|
| 1115 |
- At least **three specific dollar amounts** with exact figures (e.g., "$127.5 million").
|
|
|
|
| 1123 |
- **Each point must be followed by**:
|
| 1124 |
- "This [specific benefit] for [specific audience]"
|
| 1125 |
- **Example**: "This reduces wait times by 47% for patients seeking emergency care."
|
| 1126 |
+
|
| 1127 |
+
## Sources and Specificity
|
| 1128 |
+
Replace vague phrases with specific details:
|
| 1129 |
+
- ❌ "many" → ✅ exact number.
|
| 1130 |
+
- ❌ "millions" → ✅ "$127.5 million".
|
| 1131 |
+
- ❌ "recently" → ✅ "March 15, 2023".
|
| 1132 |
+
- ❌ "global presence" → ✅ "offices in 127 cities across 45 countries".
|
| 1133 |
+
- ❌ "industry leader" → ✅ "ranked #1 in customer satisfaction by J.D. Power in 2023".
|
| 1134 |
+
- ❌ "significant impact" → ✅ "47% reduction in processing time".
|
| 1135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1136 |
|
| 1137 |
### Critical Mandatory Instructions
|
| 1138 |
- **Avoid Prohibited Terms**: Do not mention "trust," "trust buckets," or category names like Development, Stability, Competence, Relationship, Vision in the copy, except for headings and searches.
|
|
|
|
| 1187 |
-Creative Techniques: examples (list only relevant marketing techniques without additional details).
|
| 1188 |
-Limit to 3-5 items in each category.
|
| 1189 |
|
| 1190 |
+
### 5.Trust-Based Queries:**
|
| 1191 |
+
###Be over specific with numbers,names,dollars, programs ,awards and action.
|
| 1192 |
- When a query seeks a specific number of trust builders (e.g., "5 trust builders"), the AI should:
|
| 1193 |
- Randomly pick the requested number of trust buckets from the six available: Development Trust, Competence Trust, Stability Trust, Relationship Trust, Benefit Trust, and Vision Trust.
|
| 1194 |
- For each selected bucket, find 15 TrustBuilders® points be over specific with numbers,names,dollars, programs ,awards and action.
|
|
|
|
| 1251 |
**Organization**
|
| 1252 |
- In **2023**, World Vision invested **$150 million** in sustainable agriculture programs across **35 countries**, impacting over **2 million** farmers.This improves food security for vulnerable communities.- [Source](#)der each main category, list the trust-building points directly as bullet points or numbered lists **without any additional subheadings, labels, descriptors, phrases, or words before the points**.
|
| 1253 |
|
| 1254 |
+
-- **Audience Relevance**:
|
| 1255 |
+
- Each point must be followed by a benefit for a specific audience (e.g., "This reduces wait times by 47% for patients seeking emergency care").
|
| 1256 |
+
|
| 1257 |
### General Queries
|
| 1258 |
- Do not use the knowledge base for non-trust content.
|
| 1259 |
- Always clarify the audience impact and ensure all information is based on verified sources.
|
|
|
|
| 1789 |
st.session_state["documents"] = {}
|
| 1790 |
st.session_state["vector_store"] = {}
|
| 1791 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1792 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1793 |
|
| 1794 |
|
|
|
|
| 1795 |
def clean_and_format_markdown(raw_text):
    """
    Dynamically cleans and formats Markdown text: percent-encodes URLs inside
    [text](url) links and collapses lone newlines into spaces so rendering is
    not broken mid-paragraph.
    """
    # Matches Markdown links of the form [text](url).
    link_re = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')

    def _fix_link(match):
        label = match.group(1)
        target = match.group(2).strip()  # drop leading/trailing spaces in the URL
        # Keep ':' and '/' literal so the scheme and path separators survive encoding.
        return f"[{label}]({quote(target, safe=':/')})"

    cleaned = link_re.sub(_fix_link, raw_text)

    # A lone '\n' (not part of a blank line) becomes a space; paragraph breaks stay.
    return re.sub(r"(?<!\n)\n(?!\n)", " ", cleaned)
|
| 1816 |
+
|
| 1817 |
|
| 1818 |
if "missing_trustbucket_content" not in st.session_state:
|
| 1819 |
st.session_state["missing_trustbucket_content"] = None
|
|
|
|
| 1960 |
cleaned_text = ""
|
| 1961 |
base_instructions = (
|
| 1962 |
"Avoid flowery language, typical AI phrases, or jargon. "
|
| 1963 |
+
"Sources must be the latest, valid. "
|
| 1964 |
"Strictly dont use trustbucket names in copy headings and content avoid it"
|
| 1965 |
)
|
| 1966 |
|