Spaces:

trustlogic
/

Live

Sleeping

App Files Files Community

Wajahat698 committed on Nov 24, 2024

Commit

fc8ff97

verified ·

1 Parent(s): 4dabaf1

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -114

app.py CHANGED Viewed

@@ -52,8 +52,7 @@ st.set_page_config(layout="wide")
 import logging
 import asyncio
 import re
-from docx import Document as DocxDocument
 # Set up logging to suppress Streamlit warnings about experimental functions
 logging.getLogger('streamlit').setLevel(logging.ERROR)
@@ -111,7 +110,7 @@ def convert_docx_to_md(file):
     """
     try:
         # Read the file
-        doc = DocxDocument(uploaded_file)
         # Extract all text
         text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
         if not text.strip():  # Handle empty content
@@ -159,36 +158,36 @@ def merge_markdown_contents(contents):
     return merged_content
 def upload_to_firebase(user_id, file):
-    content = convert_file_to_md(file)
     if not content:
-        return None, "Failed to convert file to content."
     doc_id = str(uuid.uuid4())
     document_data = {"content": content, "name": file.name}
-    # Save to Firebase
     db.child("users").child(user_id).child("KnowledgeBase").child(doc_id).set(document_data)
-    # Update session state
-    if "documents" not in st.session_state:
-        st.session_state["documents"] = {}
-    st.session_state["documents"][doc_id] = document_data
-    # Index the document content
     index_document_content(content, doc_id)
-    st.sidebar.success(f"Document '{file.name}' uploaded  successfully!")
-    return content,None
 def index_document_content(doc_content, doc_id):
     """
     Indexes the document content by splitting it into chunks and creating embeddings.
     """
-    # Split the document into chunks
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=500,
-        chunk_overlap=50,
-    )
     texts = text_splitter.split_text(doc_content)
     # Create embeddings for each chunk
@@ -202,6 +201,8 @@ def index_document_content(doc_content, doc_id):
     st.session_state["vector_store"][doc_id] = vector_store
 def fetch_trustbuilders(user_id):
     """
     Retrieve TrustBuilders from Firebase for a specific user.
@@ -1040,7 +1041,7 @@ def google_search(query):
             "q": query,
             "sort": "date",  # Sort results by date for freshness
             "hl": "en",  # Language: English
-            "gl": "us",  # Geolocation: United States
         }
         # Perform the search
@@ -1066,30 +1067,31 @@ def google_search(query):
 # RAG response function
 def rag_response(query):
     try:
-        # Check if the query references uploaded documents
-        if "using uploaded document" in query.lower():
-            document_response = handle_document_query(query)  # Use your existing `handle_document_query` function
-            if document_response:
-                return document_response
-        else:
-            # Proceed with the existing knowledge base logic if no uploaded document context is specified
-            retrieved_docs = search_knowledge_base(query)
-            context = "\n".join(doc.page_content for doc in retrieved_docs)
-            # Prepare the prompt with the retrieved context
-            prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
-            llm = ChatOpenAI(model="gpt-4o", temperature=0.3, api_key=openai_api_key)
-            response = llm.invoke(prompt)
-            # Replace terms in the final output as per your restrictions
-            response_content = response.content
-            return response_content
     except Exception as e:
         logger.error(f"Error generating RAG response: {e}")
-        return "Error occurred during RAG response generation"
 # Define tools
 @tool
@@ -1107,7 +1109,7 @@ tools = [knowledge_base_tool, google_search_tool]
 prompt_message = f"""
 ** You are a Professional copywriter tasked with creating non-flowery fluid, interconnected marketing content that integrates Trust Builders into various formats for any organization. Your content should be compelling, factual, well-structured, concise, and based on the knowledgebase. Write in an active voice using the first-person perspective ("we"), and avoid the third-person perspective. Creatively interconnect trust-building elements to enhance flow and impact. Avoid using terms like Stability, Development, Competence, Relationship, Benefit, Vision, trust, beacon, beacon of hope, and realm, except where specified.
-### Mandatory Verification Checklist
 Before submitting any content, ensure it includes:
 - **Specific Details**:
   - At least **three specific dollar amounts** with exact figures (e.g., "$127.5 million").
@@ -1121,16 +1123,16 @@ Before submitting any content, ensure it includes:
    - **Each point must be followed by**:
      - "This [specific benefit] for [specific audience]"
      - **Example**: "This reduces wait times by 47% for patients seeking emergency care."
-### Sources and Specificity
-- Include **current and valid source links** next to each trust building point.
-- Replace vague phrases with specific details:
-  - "many" → exact number.
-  - "millions" → "$127.5 million".
-  - "recently" → "March 15, 2023".
-  - "global presence" → "offices in 127 cities across 45 countries".
-  - "industry leader" → "ranked #1 in customer satisfaction by J.D. Power in 2023".
-  - "significant impact" → "47% reduction in processing time".
 ### Critical Mandatory Instructions
 - **Avoid Prohibited Terms**: Do not mention "trust," "trust buckets," or category names like Development, Stability, Competence, Relationship, Vision in the copy, except for headings and searches.
@@ -1185,8 +1187,8 @@ Before submitting any content, ensure it includes:
         -Creative Techniques: examples (list only relevant marketing techniques without additional details).
      -Limit to 3-5 items in each category.
-### 5.Trust-Based Queries:**
- Be over specific with numbers,names,dollars, programs ,awards and action.
 - When a query seeks a specific number of trust builders (e.g., "5 trust builders"), the AI should:
          - Randomly pick the requested number of trust buckets from the six available: Development Trust, Competence Trust, Stability Trust, Relationship Trust, Benefit Trust, and Vision Trust.
          - For each selected bucket, find 15  TrustBuilders® points be over specific with numbers,names,dollars, programs ,awards and action.
@@ -1249,6 +1251,9 @@ Before submitting any content, ensure it includes:
   **Organization**
   - In **2023**, World Vision invested **$150 million** in sustainable agriculture programs across **35 countries**, impacting over **2 million** farmers.This improves food security for vulnerable communities.- [Source](#)der each main category, list the trust-building points directly as bullet points or numbered lists **without any additional subheadings, labels, descriptors, phrases, or words before the points**.
 ### General Queries
 - Do not use the knowledge base for non-trust content.
 - Always clarify the audience impact and ensure all information is based on verified sources.
@@ -1784,80 +1789,31 @@ def load_user_memory(user_id):
         st.session_state["documents"] = {}
         st.session_state["vector_store"] = {}
-def get_document_content(doc_name=None):
-    documents = st.session_state.get("documents", {})
-    if not documents:
-        return None, "No documents have been uploaded."
-    # If a specific document name is provided
-    if doc_name:
-        for doc_id, doc_data in documents.items():
-            if doc_data.get("name", "").lower() == doc_name.lower():
-                content = doc_data.get("content")
-                if content:
-                    return content, None
-                else:
-                    return None, f"Document '{doc_name}' does not contain any content."
-        return None, f"Document '{doc_name}' not found."
-    # Default to the most recent document
-    last_doc = list(documents.values())[-1]
-    content = last_doc.get("content")
-    if content:
-        return content, None
-    else:
-        return None, "The most recently uploaded document does not contain any content."
-def handle_document_query(query):
-    """
-    Handle queries related to uploaded documents for response generation.
-    """
-    # Extract specific document name if mentioned
-    doc_name_match = re.search(r"document\s+'([^']+)'", query, re.IGNORECASE)
-    doc_name = doc_name_match.group(1) if doc_name_match else None
-    # Fetch document content
-    doc_content, error = get_document_content(doc_name)
-    if error:
-        return error
-    # Generate AI response with document context
-    full_prompt = f"Document Content:\n{doc_content}\n\nUser Query: {query}\n\nResponse:"
-    try:
-        llm = ChatOpenAI(model="gpt-4o", temperature=0.5, api_key=openai_api_key)
-        response = llm.invoke(full_prompt)
-        return response.content
-    except Exception as e:
-        return f"Error generating response using the document: {e}"
 def clean_and_format_markdown(raw_text):
     """
-    Cleans up formatting issues in dynamically generated text.
-    Fixes missing spaces, ensures proper sentence structure, and formats Markdown.
     """
-    # Fix missing spaces between words (e.g., "430billiontotheU.S.economy")
-    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # Add space between lowercase and uppercase
-    text = re.sub(r'(\d)([A-Za-z])', r'\1 \2', text)  # Add space between numbers and letters
-    text = re.sub(r'([A-Za-z])(\d)', r'\1 \2', text)  # Add space between letters and numbers
-    # Ensure proper Markdown URL formatting
-    link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
     def encode_url(match):
         text = match.group(1)
-        url = match.group(2).strip()
-        encoded_url = quote(url, safe=':/')
         return f"[{text}]({encoded_url})"
-    text = re.sub(link_pattern, encode_url, text)
-    # Ensure proper sentence spacing and line breaks
-    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)  # Replace single newlines with spaces
-    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
-    return text
 if "missing_trustbucket_content" not in st.session_state:
     st.session_state["missing_trustbucket_content"] = None
@@ -2004,7 +1960,7 @@ def handle_prompt(prompt):
                         cleaned_text = ""
                         base_instructions = (
                             "Avoid flowery language, typical AI phrases, or jargon. "
-                            "Sources must be the latest, valid, and verifiable ."
                             "Strictly dont use trustbucket names in copy headings and content avoid it"
                         )

 import logging
 import asyncio
 import re
+import docx
 # Set up logging to suppress Streamlit warnings about experimental functions
 logging.getLogger('streamlit').setLevel(logging.ERROR)
     """
     try:
         # Read the file
+        doc =  docx.Document(file)
         # Extract all text
         text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
         if not text.strip():  # Handle empty content
     return merged_content
 def upload_to_firebase(user_id, file):
     """
     Convert an uploaded file to text, persist it for the user, and index it.

     The extracted text is written to Firebase under
     users/<user_id>/KnowledgeBase/<generated id>, appended to the
     ``knowledge_base`` list in the Streamlit session state, and handed to
     ``index_document_content``. A success notice is shown in the sidebar.

     Args:
         user_id: Key of the user node the document is stored under.
         file: Uploaded file object; its ``name`` attribute is stored and displayed.

     Returns:
         ``(content, None)`` on success, or ``(None, error_message)`` when no
         content could be extracted from the file.
     """
     extracted = convert_file_to_md(file)
     if not extracted:
         return None, "Failed to extract content from the file."
     new_doc_id = str(uuid.uuid4())
     record = {"content": extracted, "name": file.name}
     # Persist the document under the user's KnowledgeBase node
     user_kb_ref = db.child("users").child(user_id).child("KnowledgeBase")
     user_kb_ref.child(new_doc_id).set(record)
     # Track the document in the in-session knowledge base
     kb_entry = {"doc_id": new_doc_id, "content": extracted}
     if "knowledge_base" in st.session_state:
         st.session_state["knowledge_base"].append(kb_entry)
     else:
         st.session_state["knowledge_base"] = [kb_entry]
     # Build the per-document index used for semantic search
     index_document_content(extracted, new_doc_id)
     st.sidebar.success(f"Document '{file.name}' uploaded successfully and added to the knowledge base!")
     return extracted, None
 def index_document_content(doc_content, doc_id):
     """
     Indexes the document content by splitting it into chunks and creating embeddings.
     """
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     texts = text_splitter.split_text(doc_content)
     # Create embeddings for each chunk
     st.session_state["vector_store"][doc_id] = vector_store
 def fetch_trustbuilders(user_id):
     """
     Retrieve TrustBuilders from Firebase for a specific user.
             "q": query,
             "sort": "date",  # Sort results by date for freshness
             "hl": "en",  # Language: English
+            "gl": "uk",  # Geolocation: United Kingdom
         }
         # Perform the search
 # RAG response function
 def rag_response(query):
     """
     Answer a query from the document chunks indexed in the session's vector stores.

     Each store in ``st.session_state["vector_store"]`` is asked for its top
     three matches; the matching chunks are joined into a context block and
     sent to the chat model together with the question.

     Args:
         query: The user's question.

     Returns:
         The model's answer text, a fixed "no relevant information" message
         when no context was retrieved, or a fixed error message if anything
         raised along the way (the exception is logged, not propagated).
     """
     try:
         # Gather the per-document stores, if any have been created this session
         stores = ()
         if "vector_store" in st.session_state:
             stores = st.session_state["vector_store"].values()
         hits = [
             hit
             for store in stores
             for hit in store.similarity_search(query, k=3)  # Adjust `k` for the number of results
         ]
         context = "\n".join(hit.page_content for hit in hits)
         if context:
             # Ask the model to answer using only the retrieved context
             prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
             llm = ChatOpenAI(model="gpt-4", temperature=0.3, api_key=openai_api_key)
             return llm.invoke(prompt).content
         return "No relevant information found in the knowledge base."
     except Exception as e:
         logger.error(f"Error generating RAG response: {e}")
         return "An error occurred during the RAG response generation process."
 # Define tools
 @tool
 prompt_message = f"""
 ** You are a Professional copywriter tasked with creating non-flowery fluid, interconnected marketing content that integrates Trust Builders into various formats for any organization. Your content should be compelling, factual, well-structured, concise, and based on the knowledgebase. Write in an active voice using the first-person perspective ("we"), and avoid the third-person perspective. Creatively interconnect trust-building elements to enhance flow and impact. Avoid using terms like Stability, Development, Competence, Relationship, Benefit, Vision, trust, beacon, beacon of hope, and realm, except where specified.
+###Mandatory Verification Checklist
 Before submitting any content, ensure it includes:
 - **Specific Details**:
   - At least **three specific dollar amounts** with exact figures (e.g., "$127.5 million").
    - **Each point must be followed by**:
      - "This [specific benefit] for [specific audience]"
      - **Example**: "This reduces wait times by 47% for patients seeking emergency care."
+## Souces and Specificty
+Replace vague phrases with specific details:
+- ❌ "many" → ✅ exact number.
+- ❌ "millions" → ✅ "$127.5 million".
+- ❌ "recently" → ✅ "March 15, 2023".
+- ❌ "global presence" → ✅ "offices in 127 cities across 45 countries".
+- ❌ "industry leader" → ✅ "ranked #1 in customer satisfaction by J.D. Power in 2023".
+- ❌ "significant impact" → ✅ "47% reduction in processing time".
 ### Critical Mandatory Instructions
 - **Avoid Prohibited Terms**: Do not mention "trust," "trust buckets," or category names like Development, Stability, Competence, Relationship, Vision in the copy, except for headings and searches.
         -Creative Techniques: examples (list only relevant marketing techniques without additional details).
      -Limit to 3-5 items in each category.
+### 5.Trust-Based Queries:**
+###Be over specific with numbers,names,dollars, programs ,awards and action.
 - When a query seeks a specific number of trust builders (e.g., "5 trust builders"), the AI should:
          - Randomly pick the requested number of trust buckets from the six available: Development Trust, Competence Trust, Stability Trust, Relationship Trust, Benefit Trust, and Vision Trust.
          - For each selected bucket, find 15  TrustBuilders® points be over specific with numbers,names,dollars, programs ,awards and action.
   **Organization**
   - In **2023**, World Vision invested **$150 million** in sustainable agriculture programs across **35 countries**, impacting over **2 million** farmers.This improves food security for vulnerable communities.- [Source](#)der each main category, list the trust-building points directly as bullet points or numbered lists **without any additional subheadings, labels, descriptors, phrases, or words before the points**.
+-- **Audience Relevance**:
+  - Each point must be followed by a benefit for a specific audience (e.g., "This reduces wait times by 47% for patients seeking emergency care").
 ### General Queries
 - Do not use the knowledge base for non-trust content.
 - Always clarify the audience impact and ensure all information is based on verified sources.
         st.session_state["documents"] = {}
         st.session_state["vector_store"] = {}
 def clean_and_format_markdown(raw_text):
     """
     Clean generated Markdown: percent-encode link targets and join soft-wrapped lines.

     Every inline link ``[text](url)`` has its URL stripped of surrounding
     whitespace and percent-encoded. URL delimiters (``? & = #`` and the other
     reserved characters) and existing ``%XX`` escapes are left untouched, so
     query strings and fragments survive and already-encoded URLs are not
     double-encoded. An opening parenthesis is still encoded because it would
     interfere with Markdown link syntax. A single newline is replaced by a
     space; blank-line paragraph breaks are kept.

     Args:
         raw_text: Markdown text to clean. ``None`` or an empty string yields ``""``.

     Returns:
         The cleaned Markdown string.
     """
     # Nothing to clean; also avoids a TypeError from re.sub on None
     if not raw_text:
         return ""
     # Regular expression to find Markdown links [text](url)
     link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
     # Characters that carry meaning inside a URL and must not be escaped.
     # "%" is included so existing escapes such as %20 are not turned into %2520.
     url_safe_chars = ":/?#[]@!$&'*+,;=%"
     def encode_url(match):
         label = match.group(1)
         url = match.group(2).strip()  # Remove leading/trailing spaces
         return f"[{label}]({quote(url, safe=url_safe_chars)})"
     # Fix Markdown links dynamically
     formatted_text = re.sub(link_pattern, encode_url, raw_text)
     # Replace single newlines with spaces; double newlines (paragraph breaks) are left alone
     return re.sub(r"(?<!\n)\n(?!\n)", " ", formatted_text)
 if "missing_trustbucket_content" not in st.session_state:
     st.session_state["missing_trustbucket_content"] = None
                         cleaned_text = ""
                         base_instructions = (
                             "Avoid flowery language, typical AI phrases, or jargon. "
+                            "Sources must be the latest, valid. ."
                             "Strictly dont use trustbucket names in copy headings and content avoid it"
                         )