Ismetdh committed on
Commit
f5c61b7
·
verified ·
1 Parent(s): 2ae4087

Update app.py

Browse files

Remove comments

Files changed (1) hide show
  1. app.py +11 -103
app.py CHANGED
@@ -1,39 +1,28 @@
1
  import streamlit as st
2
- import pdfplumber # For PDF extraction
3
- import docx # For DOCX extraction
4
  import os
5
  import re
6
  import numpy as np
7
- import google.generativeai as palm # For embedding generation
8
  from sklearn.metrics.pairwise import cosine_similarity
9
  import logging
10
  import time
11
  import uuid
12
  import json
13
- # Firebase integration imports
14
  import firebase_admin
15
  from firebase_admin import credentials, firestore
16
 
17
- # -------------------------
18
- # Firebase Initialization using Firestore
19
- # -------------------------
20
  def init_firebase():
21
  if not firebase_admin._apps:
22
- # Replace with the path to your Firebase service account key JSON file.
23
  data = json.loads(os.getenv("FIREBASE_CRED"))
24
-
25
  cred = credentials.Certificate(data)
26
- # No databaseURL is provided because we're using Firestore.
27
  firebase_admin.initialize_app(cred)
28
 
29
  init_firebase()
30
- # Create a Firestore client
31
  fs_client = firestore.client()
32
 
33
  def save_conversation_to_firestore(session_id, user_question, assistant_answer, feedback=None):
34
- """
35
- Save a complete conversation (user question + assistant answer + feedback) as a single document.
36
- """
37
  conv_ref = fs_client.collection("sessions").document(session_id).collection("conversations")
38
  data = {
39
  "user_question": user_question,
@@ -41,32 +30,21 @@ def save_conversation_to_firestore(session_id, user_question, assistant_answer,
41
  "feedback": feedback,
42
  "timestamp": firestore.SERVER_TIMESTAMP
43
  }
44
- # Add a new document with an auto-generated ID.
45
  doc_ref = conv_ref.add(data)
46
- # doc_ref returns a tuple (write_result, document_reference)
47
  return doc_ref[1].id
48
 
49
- # -------------------------
50
- # Firestore Helper Functions
51
- # -------------------------
52
  def save_message_to_firestore(session_id, role, content, feedback=None):
53
- """
54
- Save a message to Firestore under sessions/{session_id}/messages.
55
- """
56
  messages_ref = fs_client.collection("sessions").document(session_id).collection("messages")
57
  data = {
58
  "role": role,
59
  "content": content,
60
  "feedback": feedback,
61
- "timestamp": firestore.SERVER_TIMESTAMP # Server will set the timestamp
62
  }
63
- # Add a new document with an auto-generated ID.
64
  doc_ref = messages_ref.add(data)
65
- # doc_ref returns a tuple (write_result, document_reference)
66
  return doc_ref[1].id
67
 
68
  def handle_feedback(feedback_val):
69
- # Update Firestore and update local conversation history
70
  update_feedback_in_firestore(
71
  st.session_state.session_id,
72
  st.session_state.latest_conversation_id,
@@ -74,11 +52,7 @@ def handle_feedback(feedback_val):
74
  )
75
  st.session_state.conversations[-1]["feedback"] = feedback_val
76
 
77
-
78
  def fetch_messages_from_firestore(session_id):
79
- """
80
- Fetch all messages for the given session from Firestore, ordered by timestamp.
81
- """
82
  messages_ref = fs_client.collection("sessions").document(session_id).collection("messages")
83
  docs = messages_ref.order_by("timestamp").stream()
84
  messages = []
@@ -89,42 +63,27 @@ def fetch_messages_from_firestore(session_id):
89
  return messages
90
 
91
  def update_feedback_in_firestore(session_id, conversation_id, feedback):
92
- """
93
- Update the feedback field for a conversation document.
94
- """
95
  conv_doc = fs_client.collection("sessions").document(session_id).collection("conversations").document(conversation_id)
96
  conv_doc.update({"feedback": feedback})
97
 
98
- # -------------------------
99
- # Configuration
100
- # -------------------------
101
  class Config:
102
  CHUNK_WORDS = 300
103
- EMBEDDING_MODEL = "models/text-embedding-004" # Update as needed.
104
  TOP_N = 3
105
  SYSTEM_PROMPT = (
106
  "You are a helpful assistant. Answer the question using the provided context. "
107
  )
108
- GENERATION_MODEL = "models/gemini-1.5-flash"
109
 
110
- # -------------------------
111
- # API Key and Initialization for Generative AI
112
- # -------------------------
113
  API_KEY = os.getenv("GOOGLE_API_KEY")
114
  if not API_KEY:
115
  st.error("Google API key is not configured.")
116
  st.stop()
117
  palm.configure(api_key=API_KEY)
118
 
119
- # -------------------------
120
- # Logging Configuration
121
- # -------------------------
122
  logging.basicConfig(level=logging.INFO)
123
  logger = logging.getLogger(__name__)
124
 
125
- # -------------------------
126
- # Cached Embedding Function
127
- # -------------------------
128
  @st.cache_data(show_spinner=True)
129
  def generate_embedding_cached(text: str) -> list:
130
  logger.info("Calling API for embedding generation. Text snippet: %s", text[:50])
@@ -137,7 +96,7 @@ def generate_embedding_cached(text: str) -> list:
137
  if "embedding" not in response or not response["embedding"]:
138
  logger.error("No embedding returned from API.")
139
  st.error("No embedding returned. Please verify your API settings and input text.")
140
- return [0.0] * 768 # Fallback: list of zeros
141
  embedding = np.array(response["embedding"])
142
  if embedding.ndim == 2:
143
  embedding = embedding.flatten()
@@ -155,9 +114,6 @@ def generate_embedding(text: str) -> np.ndarray:
155
  embedding_list = generate_embedding_cached(text)
156
  return np.array(embedding_list)
157
 
158
- # -------------------------
159
- # File Handling
160
- # -------------------------
161
  def extract_text_from_file(uploaded_file) -> str:
162
  file_name = uploaded_file.name.lower()
163
  if file_name.endswith(".txt"):
@@ -180,16 +136,12 @@ def extract_text_from_file(uploaded_file) -> str:
180
  else:
181
  raise ValueError("Unsupported file type. Please upload a .txt, .pdf, or .docx file.")
182
 
183
- # -------------------------
184
- # Chunking the Document
185
- # -------------------------
186
  def chunk_text(text: str) -> list[str]:
187
  max_words = Config.CHUNK_WORDS
188
  paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
189
  chunks = []
190
  current_chunk = ""
191
  current_word_count = 0
192
-
193
  for paragraph in paragraphs:
194
  para_word_count = len(paragraph.split())
195
  if para_word_count > max_words:
@@ -221,21 +173,15 @@ def chunk_text(text: str) -> list[str]:
221
  else:
222
  current_chunk += paragraph + "\n\n"
223
  current_word_count += para_word_count
224
-
225
  if current_chunk:
226
  chunks.append(current_chunk.strip())
227
  return chunks
228
 
229
- # -------------------------
230
- # Process Document (Extract, Chunk, Embed)
231
- # -------------------------
232
  def process_document(uploaded_file) -> None:
233
  try:
234
- # Clear only document-related keys.
235
  keys_to_clear = ["document_text", "document_chunks", "document_embeddings"]
236
  for key in keys_to_clear:
237
  st.session_state.pop(key, None)
238
-
239
  file_text = extract_text_from_file(uploaded_file)
240
  if not file_text.strip():
241
  logger.error("Uploaded file contains no valid text.")
@@ -264,15 +210,11 @@ def process_document(uploaded_file) -> None:
264
  logger.error("Document processing failed: %s", e)
265
  st.error(f"An error occurred while processing the document: {e}")
266
 
267
- # -------------------------
268
- # Retrieve Relevant Chunks
269
- # -------------------------
270
  def search_query(query: str) -> list[tuple[str, float]]:
271
  if "document_embeddings" not in st.session_state or len(st.session_state["document_embeddings"]) == 0:
272
  logger.error("No valid document embeddings found in session state.")
273
  st.error("No valid document embeddings found. Please upload a valid document.")
274
  return []
275
-
276
  query_embedding = generate_embedding(query)
277
  if np.all(query_embedding == 0):
278
  logger.error("Query embedding is a zero vector.")
@@ -285,9 +227,6 @@ def search_query(query: str) -> list[tuple[str, float]]:
285
  results = [(st.session_state["document_chunks"][i], similarities[i]) for i in top_indices]
286
  return results
287
 
288
- # -------------------------
289
- # Generate Answer from LLM (RAG)
290
- # -------------------------
291
  def generate_answer(user_query: str, context: str) -> str:
292
  prompt = (
293
  f"System: {Config.SYSTEM_PROMPT}\n\n"
@@ -306,79 +245,48 @@ def generate_answer(user_query: str, context: str) -> str:
306
  st.error("Failed to generate answer. Please check your input and try again.")
307
  return "I'm sorry, I encountered an error generating a response."
308
 
309
- # -------------------------
310
- # Chat Interface
311
- # -------------------------
312
  def chat_app():
313
- # Initialize conversation history and session ID if not already set.
314
  if "conversations" not in st.session_state:
315
- st.session_state.conversations = [] # Each element is a dict with keys: user_question, assistant_answer, (optionally) feedback
316
  if "session_id" not in st.session_state:
317
  st.session_state.session_id = str(uuid.uuid4())
318
-
319
- # Display past conversations
320
  for conv in st.session_state.conversations:
321
- # Display the user's question
322
  with st.chat_message("user"):
323
  st.write(conv.get("user_question", ""))
324
- # Display the assistant's answer
325
  with st.chat_message("assistant"):
326
  st.write(conv.get("assistant_answer", ""))
327
- # Optionally, display feedback if available
328
  if conv.get("feedback"):
329
  st.markdown(f"**Feedback:** {conv['feedback']}")
330
-
331
- # Get new user input
332
  user_input = st.chat_input("Type your message here")
333
  if user_input:
334
- # Display the user input immediately.
335
  with st.chat_message("user"):
336
  st.write(user_input)
337
-
338
- # Retrieve relevant document chunks from the processed document.
339
  results = search_query(user_input)
340
  context = "\n\n".join([chunk for chunk, score in results]) if results else ""
341
-
342
- # Generate the assistant's answer using the retrieved context.
343
  answer = generate_answer(user_input, context)
344
  with st.chat_message("assistant"):
345
  st.write(answer)
346
-
347
- # Save the whole conversation (user question + assistant answer) as one document.
348
  conversation_id = save_conversation_to_firestore(
349
  st.session_state.session_id,
350
  user_question=user_input,
351
  assistant_answer=answer
352
  )
353
  st.session_state.latest_conversation_id = conversation_id
354
-
355
- # Append the conversation to session state (for UI history)
356
  st.session_state.conversations.append({
357
  "user_question": user_input,
358
  "assistant_answer": answer,
359
  })
360
-
361
- # Instead of a radio button, show two buttons for like/dislike.
362
- # Only show these buttons if the latest conversation has not yet been rated.
363
  if "feedback" not in st.session_state.conversations[-1]:
364
- col1, col2,col3,col4,col5,col6,col7,col8,col9,col10 = st.columns(10)
365
- col1.button("👍", key=f"feedback_like_{len(st.session_state.conversations)}",
366
- on_click=handle_feedback, args=("positive",))
367
- col2.button("👎", key=f"feedback_dislike_{len(st.session_state.conversations)}",
368
- on_click=handle_feedback, args=("negative",))
369
 
370
- # -------------------------
371
- # Main Application (Streamlit)
372
- # -------------------------
373
  def main():
374
  st.title("Code : Beta")
375
-
376
  st.sidebar.header("Upload Document")
377
  uploaded_file = st.sidebar.file_uploader("Upload (.txt, .pdf, .docx)", type=["txt", "pdf", "docx"])
378
- # Process the document only if uploaded and not already processed.
379
  if uploaded_file and not st.session_state.get("doc_processed", False):
380
  process_document(uploaded_file)
381
-
382
  if "document_text" in st.session_state:
383
  chat_app()
384
  else:
 
1
  import streamlit as st
2
+ import pdfplumber
3
+ import docx
4
  import os
5
  import re
6
  import numpy as np
7
+ import google.generativeai as palm
8
  from sklearn.metrics.pairwise import cosine_similarity
9
  import logging
10
  import time
11
  import uuid
12
  import json
 
13
  import firebase_admin
14
  from firebase_admin import credentials, firestore
15
 
 
 
 
16
def init_firebase():
    """Initialize the Firebase Admin SDK exactly once per process.

    Credentials are read from the FIREBASE_CRED environment variable, which
    must contain the service-account key as a JSON string.

    Raises:
        RuntimeError: if FIREBASE_CRED is not set. (The original code passed
            None straight into json.loads, producing an opaque TypeError.)
    """
    # firebase_admin raises if initialize_app is called twice; guard on the
    # _apps registry the same way the original code did.
    if not firebase_admin._apps:
        raw_cred = os.getenv("FIREBASE_CRED")
        if raw_cred is None:
            raise RuntimeError("FIREBASE_CRED environment variable is not set.")
        cred = credentials.Certificate(json.loads(raw_cred))
        firebase_admin.initialize_app(cred)
21
 
22
# Initialize Firebase at import time and create one module-wide Firestore
# client shared by all persistence helpers below.
init_firebase()
fs_client = firestore.client()
24
 
25
  def save_conversation_to_firestore(session_id, user_question, assistant_answer, feedback=None):
 
 
 
26
  conv_ref = fs_client.collection("sessions").document(session_id).collection("conversations")
27
  data = {
28
  "user_question": user_question,
 
30
  "feedback": feedback,
31
  "timestamp": firestore.SERVER_TIMESTAMP
32
  }
 
33
  doc_ref = conv_ref.add(data)
 
34
  return doc_ref[1].id
35
 
 
 
 
36
def save_message_to_firestore(session_id, role, content, feedback=None):
    """Persist one chat message under sessions/{session_id}/messages.

    Args:
        session_id: Identifier of the chat session document.
        role: Message author role (e.g. "user" or "assistant").
        content: Message text.
        feedback: Optional feedback value stored alongside the message.

    Returns:
        The auto-generated ID of the newly created message document.
    """
    session_doc = fs_client.collection("sessions").document(session_id)
    payload = {
        "role": role,
        "content": content,
        "feedback": feedback,
        # Let Firestore assign the timestamp so ordering is server-consistent.
        "timestamp": firestore.SERVER_TIMESTAMP,
    }
    # CollectionReference.add returns a (write_result, document_ref) tuple.
    _, message_doc = session_doc.collection("messages").add(payload)
    return message_doc.id
46
 
47
  def handle_feedback(feedback_val):
 
48
  update_feedback_in_firestore(
49
  st.session_state.session_id,
50
  st.session_state.latest_conversation_id,
 
52
  )
53
  st.session_state.conversations[-1]["feedback"] = feedback_val
54
 
 
55
  def fetch_messages_from_firestore(session_id):
 
 
 
56
  messages_ref = fs_client.collection("sessions").document(session_id).collection("messages")
57
  docs = messages_ref.order_by("timestamp").stream()
58
  messages = []
 
63
  return messages
64
 
65
def update_feedback_in_firestore(session_id, conversation_id, feedback):
    """Set the feedback field on an existing conversation document."""
    (
        fs_client.collection("sessions")
        .document(session_id)
        .collection("conversations")
        .document(conversation_id)
        .update({"feedback": feedback})
    )
68
 
 
 
 
69
class Config:
    """Application-wide tunables for chunking, retrieval and generation."""

    # Maximum number of words per document chunk.
    CHUNK_WORDS = 300
    # Gemini embedding model used for both documents and queries.
    EMBEDDING_MODEL = "models/text-embedding-004"
    # Number of top-scoring chunks retrieved as answer context.
    TOP_N = 3
    # System prompt prepended to every generation request.
    SYSTEM_PROMPT = (
        "You are a helpful assistant. Answer the question using the provided context. "
    )
    # Gemini model used to generate the final answer.
    GENERATION_MODEL = "models/gemini-1.5-flash"
77
 
 
 
 
78
# Configure the Generative AI client; halt the Streamlit app early when no
# API key is available, since nothing downstream can work without it.
API_KEY = os.getenv("GOOGLE_API_KEY")
if not API_KEY:
    st.error("Google API key is not configured.")
    st.stop()
palm.configure(api_key=API_KEY)
83
 
 
 
 
84
# Module-level logger; basicConfig is a no-op if handlers are already set.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
86
 
 
 
 
87
  @st.cache_data(show_spinner=True)
88
  def generate_embedding_cached(text: str) -> list:
89
  logger.info("Calling API for embedding generation. Text snippet: %s", text[:50])
 
96
  if "embedding" not in response or not response["embedding"]:
97
  logger.error("No embedding returned from API.")
98
  st.error("No embedding returned. Please verify your API settings and input text.")
99
+ return [0.0] * 768
100
  embedding = np.array(response["embedding"])
101
  if embedding.ndim == 2:
102
  embedding = embedding.flatten()
 
114
  embedding_list = generate_embedding_cached(text)
115
  return np.array(embedding_list)
116
 
 
 
 
117
  def extract_text_from_file(uploaded_file) -> str:
118
  file_name = uploaded_file.name.lower()
119
  if file_name.endswith(".txt"):
 
136
  else:
137
  raise ValueError("Unsupported file type. Please upload a .txt, .pdf, or .docx file.")
138
 
 
 
 
139
  def chunk_text(text: str) -> list[str]:
140
  max_words = Config.CHUNK_WORDS
141
  paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
142
  chunks = []
143
  current_chunk = ""
144
  current_word_count = 0
 
145
  for paragraph in paragraphs:
146
  para_word_count = len(paragraph.split())
147
  if para_word_count > max_words:
 
173
  else:
174
  current_chunk += paragraph + "\n\n"
175
  current_word_count += para_word_count
 
176
  if current_chunk:
177
  chunks.append(current_chunk.strip())
178
  return chunks
179
 
 
 
 
180
  def process_document(uploaded_file) -> None:
181
  try:
 
182
  keys_to_clear = ["document_text", "document_chunks", "document_embeddings"]
183
  for key in keys_to_clear:
184
  st.session_state.pop(key, None)
 
185
  file_text = extract_text_from_file(uploaded_file)
186
  if not file_text.strip():
187
  logger.error("Uploaded file contains no valid text.")
 
210
  logger.error("Document processing failed: %s", e)
211
  st.error(f"An error occurred while processing the document: {e}")
212
 
 
 
 
213
  def search_query(query: str) -> list[tuple[str, float]]:
214
  if "document_embeddings" not in st.session_state or len(st.session_state["document_embeddings"]) == 0:
215
  logger.error("No valid document embeddings found in session state.")
216
  st.error("No valid document embeddings found. Please upload a valid document.")
217
  return []
 
218
  query_embedding = generate_embedding(query)
219
  if np.all(query_embedding == 0):
220
  logger.error("Query embedding is a zero vector.")
 
227
  results = [(st.session_state["document_chunks"][i], similarities[i]) for i in top_indices]
228
  return results
229
 
 
 
 
230
  def generate_answer(user_query: str, context: str) -> str:
231
  prompt = (
232
  f"System: {Config.SYSTEM_PROMPT}\n\n"
 
245
  st.error("Failed to generate answer. Please check your input and try again.")
246
  return "I'm sorry, I encountered an error generating a response."
247
 
 
 
 
248
def chat_app():
    """Render the chat UI: replay history, handle new input, collect feedback."""
    # Per-session state: conversation history and a stable session identifier.
    if "conversations" not in st.session_state:
        st.session_state.conversations = []
    if "session_id" not in st.session_state:
        st.session_state.session_id = str(uuid.uuid4())

    # Replay every stored exchange, including any recorded feedback.
    for past in st.session_state.conversations:
        with st.chat_message("user"):
            st.write(past.get("user_question", ""))
        with st.chat_message("assistant"):
            st.write(past.get("assistant_answer", ""))
        if past.get("feedback"):
            st.markdown(f"**Feedback:** {past['feedback']}")

    user_input = st.chat_input("Type your message here")
    if user_input:
        # Echo the user's message immediately.
        with st.chat_message("user"):
            st.write(user_input)

        # Retrieve relevant chunks and generate the assistant's reply (RAG).
        hits = search_query(user_input)
        context = "\n\n".join(chunk for chunk, _score in hits) if hits else ""
        answer = generate_answer(user_input, context)
        with st.chat_message("assistant"):
            st.write(answer)

        # Persist the exchange and remember its ID for later feedback updates.
        st.session_state.latest_conversation_id = save_conversation_to_firestore(
            st.session_state.session_id,
            user_question=user_input,
            assistant_answer=answer,
        )
        st.session_state.conversations.append(
            {"user_question": user_input, "assistant_answer": answer}
        )

        # Offer like/dislike buttons until the latest exchange has been rated.
        if "feedback" not in st.session_state.conversations[-1]:
            columns = st.columns(10)
            columns[0].button(
                "👍",
                key=f"feedback_like_{len(st.session_state.conversations)}",
                on_click=handle_feedback,
                args=("positive",),
            )
            columns[1].button(
                "👎",
                key=f"feedback_dislike_{len(st.session_state.conversations)}",
                on_click=handle_feedback,
                args=("negative",),
            )
 
 
283
 
 
 
 
284
  def main():
285
  st.title("Code : Beta")
 
286
  st.sidebar.header("Upload Document")
287
  uploaded_file = st.sidebar.file_uploader("Upload (.txt, .pdf, .docx)", type=["txt", "pdf", "docx"])
 
288
  if uploaded_file and not st.session_state.get("doc_processed", False):
289
  process_document(uploaded_file)
 
290
  if "document_text" in st.session_state:
291
  chat_app()
292
  else: