Spaces:

Xindus
/

chatpdf-rafeeq

Sleeping

App Files Community

Deeksha14 committed on Jun 25, 2025

Commit

9f0715f

verified ·

1 Parent(s): f5e88eb

Upload 5 files

Browse files

Files changed (3) hide show

gitattributes +35 -0
requirements.txt +3 -1
streamlit_app.py +28 -48

gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

requirements.txt CHANGED Viewed

@@ -7,4 +7,6 @@ langchain-google-genai
 faiss-cpu
 PyPDF2
 python-docx
-beautifulsoup4

 faiss-cpu
 PyPDF2
 python-docx
+beautifulsoup4
+pinecone-client

streamlit_app.py CHANGED Viewed

@@ -1,3 +1,7 @@
 import streamlit as st
 from PyPDF2 import PdfReader
 from docx import Document
@@ -5,24 +9,32 @@ from bs4 import BeautifulSoup
 import os
 import google.generativeai as genai
 from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
-from langchain_community.vectorstores import FAISS
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains.question_answering import load_qa_chain
 from langchain.prompts import PromptTemplate
 from dotenv import load_dotenv
 # ========================
 # 1️⃣ Configuration
 # ========================
-# Load environment variables and API key
 load_dotenv()
 api_key = os.getenv("GOOGLE_API_KEY")
-if not api_key:
-    st.error("GOOGLE_API_KEY not found. Please add it to your .env file.")
     st.stop()
 genai.configure(api_key=api_key)
 # ========================
 # 2️⃣ File Size Limits
 # ========================
@@ -37,15 +49,13 @@ def validate_file_sizes(uploaded_files):
             st.warning(f"{file.name} is too large ({size_mb:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
             return False
         total_size += size_mb
     if total_size > MAX_TOTAL_SIZE_MB:
         st.warning(f"Total size of uploaded files is {total_size:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB in total.")
         return False
     return True
 # ========================
-# 3️⃣ Text Extraction Functions
 # ========================
 def get_pdf_text(pdf_docs):
     text = ""
@@ -67,26 +77,18 @@ def get_html_text(html_file):
     return soup.get_text()
 # ========================
-# 4️⃣ Text Chunking and Vector Store
 # ========================
 def get_text_chunks(text):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
     return text_splitter.split_text(text)
-def get_vector_store(text_chunks):
     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
-    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
-    # ✅ Save to Hugging Face's writable tmp directory
-    save_path = "/tmp/faiss_index"
-    vector_store.save_local(save_path)
-    return vector_store
 # ========================
-# 5️⃣ Conversational Chain Setup
 # ========================
 def get_conversational_chain():
     prompt_template = """
@@ -106,48 +108,27 @@ def get_conversational_chain():
     return chain
 def user_input(user_question):
-    save_path = "/tmp/faiss_index"
-    if not os.path.exists(f"{save_path}/index.faiss"):
-        st.error("Vector index not found. Please upload and process documents first.")
-        return
     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
-    new_db = FAISS.load_local(save_path, embeddings, allow_dangerous_deserialization=True)
-    docs = new_db.similarity_search(user_question)
     chain = get_conversational_chain()
     response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
     st.write("Reply:", response["output_text"])
 # ========================
-# 6️⃣ Streamlit App Layout
 # ========================
 def main():
     st.set_page_config(page_title="Chat with Documents")
-    st.header("Chat with your PDF, DOCX, or HTML using Gemini 💬")
     user_question = st.text_input("Ask a question about your uploaded files:")
     if user_question:
-        if os.path.exists("/tmp/faiss_index/index.faiss"):
-            user_input(user_question)
-        else:
-            st.warning("Please upload and process documents before asking a question.")
     with st.sidebar:
         st.title("Upload & Process Files")
-        uploaded_files = st.file_uploader(
-            "Upload PDF, DOCX, or HTML files (Max 2MB per file, 5MB total)",  # ✅ Custom message added here
-            accept_multiple_files=True,
-            type=['pdf', 'docx', 'html']
-        )
         if st.button("Submit & Process"):
             if not uploaded_files:
@@ -168,9 +149,8 @@ def main():
                         full_text += get_html_text(file)
                     else:
                         st.warning(f"Unsupported file type: {file.name}")
                 text_chunks = get_text_chunks(full_text)
-                get_vector_store(text_chunks)
                 st.success("Processing complete!")
 if __name__ == "__main__":

+# ========================
+# 📄 streamlit_app.py
+# Now using Pinecone instead of FAISS
+# ========================
 import streamlit as st
 from PyPDF2 import PdfReader
 from docx import Document
 import os
 import google.generativeai as genai
 from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
+from langchain.vectorstores import Pinecone
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains.question_answering import load_qa_chain
 from langchain.prompts import PromptTemplate
 from dotenv import load_dotenv
+import pinecone
 # ========================
 # 1️⃣ Configuration
 # ========================
 load_dotenv()
 api_key = os.getenv("GOOGLE_API_KEY")
+pinecone_api_key = os.getenv("PINECONE_API_KEY")
+pinecone_env = os.getenv("PINECONE_ENV")  # Example: "gcp-starter"
+if not api_key or not pinecone_api_key:
+    st.error("Missing API key(s). Please check your .env settings.")
     st.stop()
+# Init Gemini
 genai.configure(api_key=api_key)
+# Init Pinecone
+pinecone.init(api_key=pinecone_api_key, environment=pinecone_env)
+index_name = "document-chat"  # ✅ Must match what you created
 # ========================
 # 2️⃣ File Size Limits
 # ========================
             st.warning(f"{file.name} is too large ({size_mb:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
             return False
         total_size += size_mb
     if total_size > MAX_TOTAL_SIZE_MB:
         st.warning(f"Total size of uploaded files is {total_size:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB in total.")
         return False
     return True
 # ========================
+# 3️⃣ Text Extraction
 # ========================
 def get_pdf_text(pdf_docs):
     text = ""
     return soup.get_text()
 # ========================
+# 4️⃣ Chunking + Pinecone
 # ========================
 def get_text_chunks(text):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
     return text_splitter.split_text(text)
+def push_to_pinecone(chunks):
     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+    Pinecone.from_texts(texts=chunks, embedding=embeddings, index_name=index_name)
 # ========================
+# 5️⃣ Q&A Chain
 # ========================
 def get_conversational_chain():
     prompt_template = """
     return chain
 def user_input(user_question):
     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+    vectorstore = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)
+    docs = vectorstore.similarity_search(user_question)
     chain = get_conversational_chain()
     response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
     st.write("Reply:", response["output_text"])
 # ========================
+# 6️⃣ Streamlit UI
 # ========================
 def main():
     st.set_page_config(page_title="Chat with Documents")
+    st.header("Chat with your PDF, DOCX, or HTML using Gemini + Pinecone")
     user_question = st.text_input("Ask a question about your uploaded files:")
     if user_question:
+        user_input(user_question)
     with st.sidebar:
         st.title("Upload & Process Files")
+        uploaded_files = st.file_uploader("Upload PDF, DOCX, or HTML files (Max 2MB per file, 5MB total)", accept_multiple_files=True, type=['pdf', 'docx', 'html'])
         if st.button("Submit & Process"):
             if not uploaded_files:
                         full_text += get_html_text(file)
                     else:
                         st.warning(f"Unsupported file type: {file.name}")
                 text_chunks = get_text_chunks(full_text)
+                push_to_pinecone(text_chunks)
                 st.success("Processing complete!")
 if __name__ == "__main__":