sreebhargav committed on
Commit
e747eba
·
verified ·
1 Parent(s): abc7493

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -27
app.py CHANGED
@@ -9,16 +9,11 @@ from langchain_google_genai import ChatGoogleGenerativeAI
9
  from langchain.chains.question_answering import load_qa_chain
10
  from langchain.prompts import PromptTemplate
11
  from dotenv import load_dotenv
12
- from pdf2image import convert_from_bytes
13
- import pytesseract
14
- from io import BytesIO
15
 
16
  # Load API key
17
  load_dotenv()
18
  genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
19
 
20
- os.environ["STREAMLIT_CONFIG_DIR"] = "/app/.streamlit"
21
-
22
  # Inject CSS for chat bubbles
23
  st.markdown("""
24
  <style>
@@ -43,38 +38,27 @@ st.markdown("""
43
  </style>
44
  """, unsafe_allow_html=True)
45
 
46
- # Extract text (with OCR fallback)
47
  def get_pdf_text(pdf_docs):
48
- all_text = ""
49
  for pdf in pdf_docs:
50
- pdf_bytes = pdf.read()
51
- pdf_reader = PdfReader(BytesIO(pdf_bytes))
52
- text = ""
53
-
54
  for page in pdf_reader.pages:
55
  page_text = page.extract_text()
56
  if page_text:
57
  text += page_text
58
-
59
- if not text.strip():
60
- st.warning(f"OCR applied to '{pdf.name}' (scanned or image-based PDF).")
61
- images = convert_from_bytes(pdf_bytes)
62
- for i, img in enumerate(images):
63
- st.image(img, caption=f"OCR Page {i+1}", use_column_width=True)
64
- text += pytesseract.image_to_string(img)
65
-
66
- all_text += text
67
- return all_text
68
 
69
  def get_text_chunks(text):
70
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
71
- return text_splitter.split_text(text)
 
72
 
73
  def get_vector_store(text_chunks):
74
  if not text_chunks:
75
  raise ValueError("No text chunks to embed. Check if your PDF contains extractable text.")
76
  embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
77
- return FAISS.from_texts(text_chunks, embedding=embeddings)
 
78
 
79
  def get_conversational_chain():
80
  prompt_template = """
@@ -87,18 +71,20 @@ def get_conversational_chain():
87
  """
88
  model = ChatGoogleGenerativeAI(model="models/gemini-2.0-flash", temperature=0.3)
89
  prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
90
- return load_qa_chain(model, chain_type="stuff", prompt=prompt)
 
91
 
92
  def display_chat(user_msg, bot_msg):
93
  st.markdown(f"<div class='chat-bubble user'>{user_msg}</div>", unsafe_allow_html=True)
94
  st.markdown(f"<div class='chat-bubble bot'>{bot_msg}</div>", unsafe_allow_html=True)
95
 
96
  def main():
97
- st.set_page_config(page_title="Chat with PDFs (Text + Scanned)", layout="wide")
98
- st.title("πŸ“š Chat with Your PDFs using Gemini + OCR")
99
 
100
  col1, col2 = st.columns([1, 2], gap="large")
101
 
 
102
  with col1:
103
  st.header("πŸ“ Upload & Process")
104
  pdf_docs = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)
@@ -111,7 +97,7 @@ def main():
111
  raw_text = get_pdf_text(pdf_docs)
112
 
113
  if not raw_text.strip():
114
- st.error("❗ No text could be extracted, even with OCR.")
115
  return
116
 
117
  text_chunks = get_text_chunks(raw_text)
@@ -129,6 +115,7 @@ def main():
129
  except Exception as e:
130
  st.error(f"❗ Error creating vector store: {str(e)}")
131
 
 
132
  with col2:
133
  st.header("πŸ’¬ Ask Questions")
134
  user_question = st.text_input("Type your question here...")
 
9
  from langchain.chains.question_answering import load_qa_chain
10
  from langchain.prompts import PromptTemplate
11
  from dotenv import load_dotenv
 
 
 
12
 
13
  # Load API key
14
  load_dotenv()
15
  genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
16
 
 
 
17
  # Inject CSS for chat bubbles
18
  st.markdown("""
19
  <style>
 
38
  </style>
39
  """, unsafe_allow_html=True)
40
 
 
41
def get_pdf_text(pdf_docs):
    """Concatenate the extractable text of every page of every uploaded PDF.

    Args:
        pdf_docs: iterable of uploaded PDF file objects (as produced by
            st.file_uploader), each readable by PdfReader.

    Returns:
        A single string with all page texts appended in order; pages with
        no extractable text are skipped.
    """
    collected = []
    for uploaded in pdf_docs:
        reader = PdfReader(uploaded)
        for page in reader.pages:
            extracted = page.extract_text()
            # extract_text() may return None or "" for image-only pages.
            if extracted:
                collected.append(extracted)
    return "".join(collected)
 
 
 
 
 
 
 
 
 
50
 
51
def get_text_chunks(text):
    """Split raw document text into overlapping chunks for embedding.

    Uses a recursive character splitter with a 10,000-character window and
    1,000-character overlap so context is preserved across chunk edges.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    return splitter.split_text(text)
55
 
56
def get_vector_store(text_chunks):
    """Embed the given text chunks and build an in-memory FAISS index.

    Args:
        text_chunks: non-empty list of strings to embed.

    Returns:
        A FAISS vector store built from the chunks.

    Raises:
        ValueError: if *text_chunks* is empty (nothing to embed).
    """
    if not text_chunks:
        raise ValueError("No text chunks to embed. Check if your PDF contains extractable text.")
    embedder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    return FAISS.from_texts(text_chunks, embedding=embedder)
62
 
63
  def get_conversational_chain():
64
  prompt_template = """
 
71
  """
72
  model = ChatGoogleGenerativeAI(model="models/gemini-2.0-flash", temperature=0.3)
73
  prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
74
+ chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
75
+ return chain
76
 
77
def display_chat(user_msg, bot_msg):
    """Render one user/bot exchange as styled chat bubbles.

    Both messages are HTML-escaped before interpolation because they flow
    into st.markdown(..., unsafe_allow_html=True); without escaping, any
    HTML/JS typed by the user (or echoed by the model) would be executed
    by the browser (markup/script injection).
    """
    import html  # stdlib; local import keeps this fix self-contained

    safe_user = html.escape(user_msg)
    safe_bot = html.escape(bot_msg)
    st.markdown(f"<div class='chat-bubble user'>{safe_user}</div>", unsafe_allow_html=True)
    st.markdown(f"<div class='chat-bubble bot'>{safe_bot}</div>", unsafe_allow_html=True)
80
 
81
  def main():
82
+ st.set_page_config(page_title="Chat with PDFs", layout="wide")
83
+ st.title("πŸ“š Chat with Your PDFs using Gemini")
84
 
85
  col1, col2 = st.columns([1, 2], gap="large")
86
 
87
+ # LEFT: Upload PDFs
88
  with col1:
89
  st.header("πŸ“ Upload & Process")
90
  pdf_docs = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)
 
97
  raw_text = get_pdf_text(pdf_docs)
98
 
99
  if not raw_text.strip():
100
+ st.error("❗ No extractable text found in the uploaded PDFs. They might be scanned images.")
101
  return
102
 
103
  text_chunks = get_text_chunks(raw_text)
 
115
  except Exception as e:
116
  st.error(f"❗ Error creating vector store: {str(e)}")
117
 
118
+ # RIGHT: Ask Questions
119
  with col2:
120
  st.header("πŸ’¬ Ask Questions")
121
  user_question = st.text_input("Type your question here...")