Deeksha committed on
Commit
fa9d3ad
·
0 Parent(s):

Initial clean commit for Hugging Face deployment

Browse files
Files changed (5) hide show
  1. .DS_Store +0 -0
  2. .gitignore +5 -0
  3. Dockerfile +23 -0
  4. requirements.txt +10 -0
  5. streamlit_app.py +156 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ venv/
4
+ .env
5
+ faiss_index/
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use official lightweight Python image
FROM python:3.10-slim

# Disable usage-stats collection and run headless on the Hugging Face Spaces port.
ENV STREAMLIT_BROWSER_GATHERUSAGESTATS=false
ENV STREAMLIT_DISABLE_WATCHDOG_WARNINGS=true
ENV STREAMLIT_SERVER_HEADLESS=true
ENV STREAMLIT_SERVER_PORT=7860
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
# HOME=/tmp lets Streamlit write its config/cache in the unprivileged container.
ENV HOME=/tmp

# Set working directory
WORKDIR /app

# Copy requirements first so dependency layers are cached across code changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the code
COPY . .

# Run the app. The entry point committed in this repo is streamlit_app.py;
# the previous CMD referenced chatpdf1.py, which does not exist in this commit.
CMD ["streamlit", "run", "streamlit_app.py"]
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ google-generativeai
3
+ python-dotenv
4
+ langchain
5
+ langchain-community
6
+ langchain-google-genai
7
+ faiss-cpu
8
+ PyPDF2
9
+ python-docx
10
+ beautifulsoup4
streamlit_app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from PyPDF2 import PdfReader
3
+ from docx import Document
4
+ from bs4 import BeautifulSoup
5
+ import os
6
+ import google.generativeai as genai
7
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
8
+ from langchain_community.vectorstores import FAISS
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain.chains.question_answering import load_qa_chain
11
+ from langchain.prompts import PromptTemplate
12
+ from dotenv import load_dotenv
13
+
14
# ========================
# 1️⃣ Configuration
# ========================
# Pull GOOGLE_API_KEY from a .env file (if present) and wire up the Gemini SDK.
# st.stop() halts the script run, so genai is only configured with a real key.
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
if api_key:
    genai.configure(api_key=api_key)
else:
    st.error("GOOGLE_API_KEY not found. Please add it to your .env file.")
    st.stop()
25
+
26
# ========================
# 2️⃣ File Size Limits
# ========================
MAX_TOTAL_SIZE_MB = 5
MAX_FILE_SIZE_MB = 2

def validate_file_sizes(uploaded_files):
    """Return True when every file is under the per-file cap and the batch
    is under the total cap; otherwise show a Streamlit warning and return False."""
    running_total_mb = 0.0
    for upload in uploaded_files:
        mb = upload.size / (1024 * 1024)
        if mb > MAX_FILE_SIZE_MB:
            st.warning(f"{upload.name} is too large ({mb:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
            return False
        running_total_mb += mb

    if running_total_mb > MAX_TOTAL_SIZE_MB:
        st.warning(f"Total size of uploaded files is {running_total_mb:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB in total.")
        return False

    return True
46
+
47
+ # ========================
48
+ # 3️⃣ Text Extraction Functions
49
+ # ========================
50
def get_pdf_text(pdf_docs):
    """Concatenate the extractable text of every page of every uploaded PDF.

    Pages with no extractable text (extract_text() returns None) are skipped.
    """
    pieces = []
    for pdf in pdf_docs:
        for page in PdfReader(pdf).pages:
            extracted = page.extract_text()
            if extracted:
                pieces.append(extracted)
    return "".join(pieces)
59
+
60
def get_docx_text(docx_file):
    """Return the text of every paragraph in a .docx file, newline-joined."""
    document = Document(docx_file)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
63
+
64
def get_html_text(html_file):
    """Strip markup from an uploaded HTML file and return its visible text."""
    markup = html_file.read()
    return BeautifulSoup(markup, "html.parser").get_text()
68
+
69
+ # ========================
70
+ # 4️⃣ Text Chunking and Vector Store
71
+ # ========================
72
def get_text_chunks(text):
    """Split raw text into 2000-char chunks with 200-char overlap for embedding."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    return splitter.split_text(text)
75
+
76
def get_vector_store(text_chunks):
    """Embed the chunks with Gemini embeddings and persist a FAISS index
    to the local "faiss_index" directory (read back later by user_input)."""
    store = FAISS.from_texts(
        text_chunks,
        embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
    )
    store.save_local("faiss_index")
80
+
81
+ # ========================
82
+ # 5️⃣ Conversational Chain Setup
83
+ # ========================
84
def get_conversational_chain():
    """Build a "stuff"-type QA chain over Gemini 1.5 Flash with a grounded prompt."""
    prompt_template = """
    Answer the question as detailed as possible from the provided context. If the answer is not available, say "answer is not available in the context."

    Context:
    {context}

    Question:
    {question}

    Answer:
    """
    qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.3)
    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)
100
+
101
def user_input(user_question):
    """Answer *user_question* against the locally saved FAISS index.

    Loads the index written by get_vector_store(), retrieves the most similar
    chunks, runs them through the QA chain, and writes the answer to the page.
    """
    # Guard: previously this crashed with a raw traceback if the user asked a
    # question before any files had been processed (no faiss_index on disk).
    if not os.path.isdir("faiss_index"):
        st.warning("No processed documents found. Please upload and process files first.")
        return

    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    # allow_dangerous_deserialization is required for pickle-backed FAISS stores;
    # acceptable here because the index is produced by this same app, not user input.
    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    docs = new_db.similarity_search(user_question)

    chain = get_conversational_chain()
    response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
    st.write("Reply:", response["output_text"])
111
+
112
+ # ========================
113
+ # 6️⃣ Streamlit App Layout
114
+ # ========================
115
def main():
    """Streamlit entry point: upload documents, build the index, answer questions."""
    st.set_page_config(page_title="Chat with Documents")
    st.header("Chat with your PDF, DOCX, or HTML using Gemini 💬")

    user_question = st.text_input("Ask a question about your uploaded files:")

    if user_question:
        user_input(user_question)

    with st.sidebar:
        st.title("Upload & Process Files")
        uploaded_files = st.file_uploader(
            "Upload PDF, DOCX, or HTML files",
            accept_multiple_files=True,
            type=['pdf', 'docx', 'html'],
        )

        if st.button("Submit & Process"):
            if not uploaded_files:
                st.warning("Please upload at least one file.")
                return

            if not validate_file_sizes(uploaded_files):
                return

            with st.spinner("Processing files..."):
                full_text = ""
                for file in uploaded_files:
                    if file.name.endswith(".pdf"):
                        full_text += get_pdf_text([file])
                    elif file.name.endswith(".docx"):
                        full_text += get_docx_text(file)
                    elif file.name.endswith(".html"):
                        full_text += get_html_text(file)
                    else:
                        # file_uploader restricts types, but keep a defensive branch.
                        st.warning(f"Unsupported file type: {file.name}")

                # Guard: FAISS.from_texts raises on an empty chunk list (e.g. a
                # scanned PDF with no extractable text) — fail with a clear message.
                if not full_text.strip():
                    st.warning("No text could be extracted from the uploaded files.")
                    return

                text_chunks = get_text_chunks(full_text)
                get_vector_store(text_chunks)
                st.success("Processing complete!")

if __name__ == "__main__":
    main()