Ismetdh commited on
Commit
98d0333
·
verified ·
1 Parent(s): 8849b0e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +388 -0
app.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pdfplumber # For PDF extraction
3
+ import docx # For DOCX extraction
4
+ import os
5
+ import re
6
+ import numpy as np
7
+ import google.generativeai as palm # For embedding generation
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+ import logging
10
+ import time
11
+ import uuid
12
+ import json
13
+ # Firebase integration imports
14
+ import firebase_admin
15
+ from firebase_admin import credentials, firestore
16
+
17
+ # -------------------------
18
+ # Firebase Initialization using Firestore
19
+ # -------------------------
20
def init_firebase():
    """Initialise the Firebase Admin SDK exactly once per process.

    Credentials are read from the FIREBASE_CRED environment variable,
    which must contain the service-account key as a JSON string.

    Raises:
        RuntimeError: if FIREBASE_CRED is unset or empty — a clear
            message instead of the opaque TypeError that
            ``json.loads(None)`` would otherwise raise.
    """
    if not firebase_admin._apps:
        raw = os.getenv("FIREBASE_CRED")
        if not raw:
            raise RuntimeError(
                "FIREBASE_CRED environment variable is not set; "
                "it must hold the Firebase service-account key JSON."
            )
        data = json.loads(raw)
        cred = credentials.Certificate(data)
        # No databaseURL is provided because we're using Firestore.
        firebase_admin.initialize_app(cred)
28
+
29
# Initialise Firebase at import time so the Firestore client below can
# be created immediately and shared by all helper functions.
init_firebase()
# Create a Firestore client
fs_client = firestore.client()
32
+
33
def save_conversation_to_firestore(session_id, user_question, assistant_answer, feedback=None):
    """
    Persist one question/answer exchange (plus optional feedback) as a
    single document under sessions/{session_id}/conversations.

    Returns the auto-generated ID of the new conversation document.
    """
    payload = {
        "user_question": user_question,
        "assistant_answer": assistant_answer,
        "feedback": feedback,
        # Server-side timestamp keeps ordering consistent across clients.
        "timestamp": firestore.SERVER_TIMESTAMP,
    }
    session_doc = fs_client.collection("sessions").document(session_id)
    # add() returns a (write_result, document_reference) pair.
    _, new_doc = session_doc.collection("conversations").add(payload)
    return new_doc.id
48
+
49
+ # -------------------------
50
+ # Firestore Helper Functions
51
+ # -------------------------
52
def save_message_to_firestore(session_id, role, content, feedback=None):
    """
    Store one chat message under sessions/{session_id}/messages and
    return the auto-generated document ID.
    """
    payload = {
        "role": role,
        "content": content,
        "feedback": feedback,
        # Let the server stamp the write time so ordering is reliable.
        "timestamp": firestore.SERVER_TIMESTAMP,
    }
    messages = fs_client.collection("sessions").document(session_id).collection("messages")
    # add() returns a (write_result, document_reference) pair.
    _, message_doc = messages.add(payload)
    return message_doc.id
67
+
68
def handle_feedback(feedback_val):
    """Record like/dislike feedback for the most recent conversation.

    Persists the rating to Firestore first, then mirrors it into the
    local conversation history used by the UI.
    """
    session_id = st.session_state.session_id
    conversation_id = st.session_state.latest_conversation_id
    update_feedback_in_firestore(session_id, conversation_id, feedback_val)
    st.session_state.conversations[-1]["feedback"] = feedback_val
76
+
77
+
78
def fetch_messages_from_firestore(session_id):
    """
    Return every message for *session_id*, oldest first.

    Each returned dict is the Firestore document's fields augmented
    with its document ID under the "id" key.
    """
    collection = (
        fs_client.collection("sessions")
        .document(session_id)
        .collection("messages")
    )
    result = []
    for snapshot in collection.order_by("timestamp").stream():
        record = snapshot.to_dict()
        record["id"] = snapshot.id
        result.append(record)
    return result
90
+
91
def update_feedback_in_firestore(session_id, conversation_id, feedback):
    """
    Set the feedback field on one conversation document.
    """
    (
        fs_client.collection("sessions")
        .document(session_id)
        .collection("conversations")
        .document(conversation_id)
        .update({"feedback": feedback})
    )
97
+
98
+ # -------------------------
99
+ # Configuration
100
+ # -------------------------
101
class Config:
    """Central tuning knobs for chunking, retrieval and generation."""
    # Target maximum number of words per document chunk.
    CHUNK_WORDS = 300
    EMBEDDING_MODEL = "models/text-embedding-004"  # Update as needed.
    # Number of most-similar chunks retrieved per query.
    TOP_N = 3
    SYSTEM_PROMPT = (
        "You are a helpful assistant. Answer the question using the provided context. "
    )
    # Gemini model used to generate the final answer.
    GENERATION_MODEL = "models/gemini-1.5-flash"
109
+
110
+ # -------------------------
111
+ # API Key and Initialization for Generative AI
112
+ # -------------------------
113
API_KEY = os.getenv("GOOGLE_API_KEY")
if not API_KEY:
    # Fail fast in the UI: nothing below works without an API key.
    st.error("Google API key is not configured.")
    st.stop()
palm.configure(api_key=API_KEY)

# -------------------------
# Logging Configuration
# -------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
124
+
125
+ # -------------------------
126
+ # Cached Embedding Function
127
+ # -------------------------
128
@st.cache_data(show_spinner=True)
def generate_embedding_cached(text: str, task_type: str = "retrieval_document") -> list:
    """Generate (and cache via Streamlit) an embedding for *text*.

    Args:
        text: The text to embed.
        task_type: Gemini embedding task type. Defaults to
            "retrieval_document" (the original hard-coded value, so
            existing callers are unchanged). NOTE(review): queries are
            currently embedded with the document task type too; Gemini
            recommends "retrieval_query" for queries — confirm before
            switching callers, since it invalidates the cache.

    Returns:
        The embedding as a flat list of floats, or a zero vector of
        length 768 (text-embedding-004's dimensionality) on failure so
        downstream cosine-similarity code never sees None.
    """
    logger.info("Calling API for embedding generation. Text snippet: %s", text[:50])
    fallback = [0.0] * 768  # single definition of the failure sentinel
    try:
        response = palm.embed_content(
            model=Config.EMBEDDING_MODEL,
            content=text,
            task_type=task_type
        )
        if "embedding" not in response or not response["embedding"]:
            logger.error("No embedding returned from API.")
            st.error("No embedding returned. Please verify your API settings and input text.")
            return fallback
        embedding = np.array(response["embedding"])
        if embedding.ndim == 2:
            # Some responses wrap the vector in an extra batch axis.
            embedding = embedding.flatten()
        elif embedding.ndim > 2:
            logger.error("Embedding has more than 2 dimensions.")
            st.error("Invalid embedding dimensions. Please check the API response.")
            return fallback
        return embedding.tolist()
    except Exception as e:
        logger.error("Embedding generation failed: %s", e)
        st.error(f"Embedding generation failed: {e}")
        return fallback
153
+
154
def generate_embedding(text: str) -> np.ndarray:
    """Return the cached embedding for *text* as a NumPy array."""
    return np.asarray(generate_embedding_cached(text))
157
+
158
+ # -------------------------
159
+ # File Handling
160
+ # -------------------------
161
def extract_text_from_file(uploaded_file) -> str:
    """Extract plain text from an uploaded .txt, .pdf or .docx file.

    Raises:
        ValueError: for any other file extension.
    """
    name = uploaded_file.name.lower()
    if name.endswith(".txt"):
        logger.info("Processing TXT file.")
        return uploaded_file.read().decode("utf-8")
    if name.endswith(".pdf"):
        logger.info("Processing PDF file.")
        with pdfplumber.open(uploaded_file) as pdf:
            pages = [page.extract_text() for page in pdf.pages]
            # Skip pages where extraction yielded nothing.
            text = "\n".join(p for p in pages if p)
        if not text:
            logger.error("PDF extraction returned empty text.")
        return text
    if name.endswith(".docx"):
        logger.info("Processing DOCX file.")
        document = docx.Document(uploaded_file)
        text = "\n".join(paragraph.text for paragraph in document.paragraphs)
        if not text:
            logger.error("DOCX extraction returned empty text.")
        return text
    raise ValueError("Unsupported file type. Please upload a .txt, .pdf, or .docx file.")
182
+
183
+ # -------------------------
184
+ # Chunking the Document
185
+ # -------------------------
186
def chunk_text(text: str, max_words=None) -> list[str]:
    """Split *text* into chunks of roughly at most *max_words* words.

    Paragraphs (separated by blank lines) are packed greedily into
    chunks; a paragraph that alone exceeds the limit is split on
    sentence boundaries instead.

    Args:
        text: The document text.
        max_words: Word budget per chunk. Defaults to
            Config.CHUNK_WORDS, preserving the original behaviour; the
            parameter generalises the previously hard-coded limit.

    Returns:
        A list of non-empty chunk strings (empty list for blank input).
    """
    if max_words is None:
        max_words = Config.CHUNK_WORDS
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    chunks = []
    current_chunk = ""
    current_word_count = 0

    for paragraph in paragraphs:
        para_word_count = len(paragraph.split())
        if para_word_count > max_words:
            # Oversized paragraph: flush the running chunk, then split
            # the paragraph itself on sentence boundaries.
            if current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = ""
                current_word_count = 0
            sentences = re.split(r'(?<=[.!?])\s+', paragraph)
            temp_chunk = ""
            temp_word_count = 0
            for sentence in sentences:
                sentence_word_count = len(sentence.split())
                if temp_word_count + sentence_word_count > max_words:
                    if temp_chunk:
                        chunks.append(temp_chunk.strip())
                    temp_chunk = sentence + " "
                    temp_word_count = sentence_word_count
                else:
                    temp_chunk += sentence + " "
                    temp_word_count += sentence_word_count
            if temp_chunk:
                chunks.append(temp_chunk.strip())
        else:
            # Normal paragraph: start a new chunk if it would overflow.
            if current_word_count + para_word_count > max_words:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = paragraph + "\n\n"
                current_word_count = para_word_count
            else:
                current_chunk += paragraph + "\n\n"
                current_word_count += para_word_count

    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
228
+
229
+ # -------------------------
230
+ # Process Document (Extract, Chunk, Embed)
231
+ # -------------------------
232
def process_document(uploaded_file) -> None:
    """Extract, chunk and embed *uploaded_file*.

    On success, stores document_text / document_chunks /
    document_embeddings in Streamlit session state and marks
    doc_processed. Shows a Streamlit error and returns early on any
    failure.
    """
    try:
        # Drop stale document state from a previous upload.
        for stale_key in ("document_text", "document_chunks", "document_embeddings"):
            st.session_state.pop(stale_key, None)

        raw_text = extract_text_from_file(uploaded_file)
        if not raw_text.strip():
            logger.error("Uploaded file contains no valid text.")
            st.error("The uploaded file contains no valid text.")
            return

        pieces = chunk_text(raw_text)
        if not pieces:
            logger.error("No chunks generated from text.")
            st.error("Failed to split text into chunks.")
            return

        vectors = [generate_embedding(piece) for piece in pieces]
        # Zero vectors are the embedding helper's failure sentinel.
        if all(np.all(vec == 0) for vec in vectors):
            logger.error("All embeddings are zero vectors.")
            st.error("Failed to generate valid embeddings.")
            return

        st.session_state.update({
            "document_text": raw_text,
            "document_chunks": pieces,
            "document_embeddings": vectors,
        })
        if not st.session_state.get("doc_processed", False):
            placeholder = st.empty()
            placeholder.success("Document processing complete! You can now start chatting.")
            st.session_state.doc_processed = True
    except Exception as e:
        logger.error("Document processing failed: %s", e)
        st.error(f"An error occurred while processing the document: {e}")
266
+
267
+ # -------------------------
268
+ # Retrieve Relevant Chunks
269
+ # -------------------------
270
def search_query(query: str) -> list[tuple[str, float]]:
    """Find the document chunks most relevant to *query*.

    Returns up to Config.TOP_N (chunk, cosine-similarity) pairs,
    highest similarity first; empty list on any failure.
    """
    stored = st.session_state.get("document_embeddings")
    if stored is None or len(stored) == 0:
        logger.error("No valid document embeddings found in session state.")
        st.error("No valid document embeddings found. Please upload a valid document.")
        return []

    query_vec = generate_embedding(query)
    if np.all(query_vec == 0):
        # Zero vector is the embedding helper's failure sentinel.
        logger.error("Query embedding is a zero vector.")
        st.error("Failed to generate a valid query embedding.")
        return []

    scores = cosine_similarity(query_vec.reshape(1, -1), np.vstack(stored))[0]
    best = np.argsort(scores)[-Config.TOP_N:][::-1]
    return [(st.session_state["document_chunks"][i], scores[i]) for i in best]
287
+
288
+ # -------------------------
289
+ # Generate Answer from LLM (RAG)
290
+ # -------------------------
291
def generate_answer(user_query: str, context: str) -> str:
    """Ask the Gemini model to answer *user_query* using *context*.

    Args:
        user_query: The user's question.
        context: Retrieved document chunks to ground the answer.

    Returns:
        The model's answer text, or an apology string on failure.
        Always a str — the original returned the raw response object
        when it lacked a ``.text`` attribute, violating the annotated
        return type and breaking downstream Firestore persistence.
    """
    prompt = (
        f"System: {Config.SYSTEM_PROMPT}\n\n"
        f"Context:\n{context}\n\n"
        f"User: {user_query}\nAssistant:"
    )
    try:
        model = palm.GenerativeModel(Config.GENERATION_MODEL)
        response = model.generate_content(prompt)
        # Bug fix: coerce to str so callers (UI + Firestore) always get text.
        return response.text if hasattr(response, "text") else str(response)
    except Exception as e:
        logger.error("Failed to generate answer: %s", e)
        st.error("Failed to generate answer. Please check your input and try again.")
        return "I'm sorry, I encountered an error generating a response."
308
+
309
+ # -------------------------
310
+ # Chat Interface
311
+ # -------------------------
312
def chat_app():
    """Render the chat UI.

    Shows the conversation history, handles a new user message via
    retrieval + generation, persists each exchange to Firestore, and
    offers like/dislike feedback buttons for the latest answer.
    """
    # Initialize conversation history and session ID if not already set.
    if "conversations" not in st.session_state:
        # Each element: {"user_question", "assistant_answer", optional "feedback"}
        st.session_state.conversations = []
    if "session_id" not in st.session_state:
        st.session_state.session_id = str(uuid.uuid4())

    # Display past conversations.
    for conv in st.session_state.conversations:
        with st.chat_message("user"):
            st.write(conv.get("user_question", ""))
        with st.chat_message("assistant"):
            st.write(conv.get("assistant_answer", ""))
        # Optionally, display feedback if available.
        if conv.get("feedback"):
            st.markdown(f"**Feedback:** {conv['feedback']}")

    # Get new user input.
    user_input = st.chat_input("Type your message here")
    if user_input:
        # Display the user input immediately.
        with st.chat_message("user"):
            st.write(user_input)

        # Retrieve relevant document chunks from the processed document.
        results = search_query(user_input)
        context = "\n\n".join([chunk for chunk, score in results]) if results else ""

        # Generate the assistant's answer using the retrieved context.
        answer = generate_answer(user_input, context)
        with st.chat_message("assistant"):
            st.write(answer)

        # Save the whole conversation (question + answer) as one document.
        conversation_id = save_conversation_to_firestore(
            st.session_state.session_id,
            user_question=user_input,
            assistant_answer=answer,
        )
        st.session_state.latest_conversation_id = conversation_id

        # Append the conversation to session state (for UI history).
        st.session_state.conversations.append({
            "user_question": user_input,
            "assistant_answer": answer,
        })

    # Show like/dislike buttons for the latest conversation, but only if
    # one exists (bug fix: the unguarded [-1] raised IndexError before
    # the first message was sent) and it has not been rated yet.
    if st.session_state.conversations and "feedback" not in st.session_state.conversations[-1]:
        # Ten columns keep the two small buttons adjacent on the left.
        cols = st.columns(10)
        cols[0].button(
            "👍",
            key=f"feedback_like_{len(st.session_state.conversations)}",
            on_click=handle_feedback,
            args=("positive",),
        )
        cols[1].button(
            "👎",
            key=f"feedback_dislike_{len(st.session_state.conversations)}",
            on_click=handle_feedback,
            args=("negative",),
        )
369
+
370
+ # -------------------------
371
+ # Main Application (Streamlit)
372
+ # -------------------------
373
def main():
    """Application entry point: sidebar document upload plus chat pane."""
    st.title("Code : Beta")

    st.sidebar.header("Upload Document")
    uploaded_file = st.sidebar.file_uploader(
        "Upload (.txt, .pdf, .docx)", type=["txt", "pdf", "docx"]
    )
    # Process the document only if uploaded and not already processed.
    if uploaded_file and not st.session_state.get("doc_processed", False):
        process_document(uploaded_file)

    if "document_text" in st.session_state:
        chat_app()
    else:
        st.info("Please upload and process a document from the sidebar to start chatting.")


if __name__ == "__main__":
    main()