# Repository path: chatpdf-rafeeq / streamlit_app.py
# Hosting-page residue: "Deeksha14's picture"; commit message "Upload streamlit_app.py"; revision 6eaf229 (verified)
# ========================
# 📄 streamlit_app.py
# LangChain + Gemini 1.5 Flash without FAISS
# ========================
import streamlit as st
from PyPDF2 import PdfReader
from docx import Document
from bs4 import BeautifulSoup
import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain_core.documents import Document
# ========================
# 1️⃣ Configuration and Setup
# ========================
# Let python-dotenv populate the environment before the key is looked up.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# The key is later handed to the Gemini chat model, so an unset or empty value
# is reported in the UI and the current script run is halted right here.
if GOOGLE_API_KEY is None or GOOGLE_API_KEY == "":
    st.error("Missing GOOGLE_API_KEY in environment variables.")
    st.stop()
# ========================
# 2️⃣ File Size Limits
# ========================
MAX_TOTAL_SIZE_MB = 5
MAX_FILE_SIZE_MB = 2


def validate_file_sizes(uploaded_files):
    """Check the uploads against the per-file and combined size limits.

    Args:
        uploaded_files: Iterable of objects exposing ``size`` (in bytes) and
            ``name``.

    Returns:
        True when every file is at most ``MAX_FILE_SIZE_MB`` and the combined
        size is at most ``MAX_TOTAL_SIZE_MB``; otherwise False, after showing
        a Streamlit warning that names the limit that was exceeded.
    """
    bytes_per_mb = 1024 * 1024
    combined_mb = 0
    for upload in uploaded_files:
        upload_mb = upload.size / bytes_per_mb
        # The first oversized file ends validation immediately.
        if upload_mb > MAX_FILE_SIZE_MB:
            st.warning(f"{upload.name} is too large ({upload_mb:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
            return False
        combined_mb += upload_mb

    if combined_mb <= MAX_TOTAL_SIZE_MB:
        return True

    st.warning(f"Total size of all files is {combined_mb:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB total.")
    return False
# ========================
# 3️⃣ Text Extraction
# ========================
def get_pdf_text(pdf_docs):
    """Concatenate the extractable text of every page of every given PDF.

    Args:
        pdf_docs: Iterable of inputs accepted by ``PdfReader``.

    Returns:
        The page texts joined back to back with no separator. Pages for which
        ``extract_text()`` returns nothing are skipped.
    """
    page_texts = []
    for pdf in pdf_docs:
        for page in PdfReader(pdf).pages:
            extracted = page.extract_text()
            if extracted:
                page_texts.append(extracted)
    return "".join(page_texts)
def get_docx_text(docx_file):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Args:
        docx_file: Input handed to python-docx's ``Document`` constructor
            (the app passes the uploaded file object).

    Returns:
        The text of every entry in ``doc.paragraphs`` joined with newlines.
        Only ``doc.paragraphs`` is read; nothing else in the file is visited.
    """
    # The module imports ``Document`` twice: first from ``docx`` and later from
    # ``langchain_core.documents``. The later import wins, so the bare name no
    # longer refers to python-docx. Import it locally under an alias so this
    # function always opens the file with the DOCX reader.
    from docx import Document as DocxDocument

    doc = DocxDocument(docx_file)
    return "\n".join(para.text for para in doc.paragraphs)
def get_html_text(html_file):
    """Return the text content of an HTML file.

    Args:
        html_file: Object with a ``read()`` method yielding the markup.

    Returns:
        Whatever ``BeautifulSoup(...).get_text()`` produces for the markup
        when parsed with the built-in ``html.parser`` backend.
    """
    raw_markup = html_file.read()
    return BeautifulSoup(raw_markup, "html.parser").get_text()
# ========================
# 4️⃣ LangChain Q&A Chain
# ========================
def get_conversational_chain():
    """Build the question-answering chain used to query Gemini.

    Returns:
        The chain created by ``load_qa_chain`` with ``chain_type="stuff"``,
        a ``gemini-1.5-flash`` chat model at temperature 0.3, and a prompt
        that takes ``context`` and ``question`` variables.
    """
    qa_template = """
Answer the question as detailed as possible from the provided context. If the answer is not available, say "answer is not available in the context."
Context:
{context}
Question:
{question}
Answer:
"""
    qa_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=qa_template,
    )
    llm = ChatGoogleGenerativeAI(
        google_api_key=GOOGLE_API_KEY,
        temperature=0.3,
        model="gemini-1.5-flash",
    )
    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)
# ========================
# 5️⃣ Streamlit App
# ========================
def _extract_uploaded_text(file):
    """Return the text of one uploaded file, or None if its type is unsupported."""
    # Match on the lower-cased name so an upload such as "REPORT.PDF" is routed
    # to the PDF extractor instead of being reported as unsupported.
    lowered_name = file.name.lower()
    if lowered_name.endswith(".pdf"):
        return get_pdf_text([file])
    if lowered_name.endswith(".docx"):
        return get_docx_text(file)
    if lowered_name.endswith(".html"):
        return get_html_text(file)
    return None


def main():
    """Render the app: a sidebar for uploading files and a question box.

    Clicking "Submit & Extract" validates the uploads, extracts their text and
    stores the first 3000 characters in ``st.session_state["context_text"]``.
    While that key is present, a question typed by the user is answered by the
    Gemini chain using the stored text as its only context.
    """
    st.set_page_config(page_title="Gemini Q&A Without FAISS")
    st.header("📄 Chat with Uploaded Documents (FAISS-Free Gemini Q&A)")

    # Upload and extract
    with st.sidebar:
        st.title("Upload Files")
        uploaded_files = st.file_uploader(
            "Upload PDF, DOCX, or HTML files (Max 2MB/file, 5MB total)",
            accept_multiple_files=True,
            type=['pdf', 'docx', 'html'],
        )
        if st.button("Submit & Extract"):
            if not uploaded_files:
                st.warning("Please upload at least one file.")
                return
            if not validate_file_sizes(uploaded_files):
                return

            extracted_parts = []
            with st.spinner("Extracting file content..."):
                for file in uploaded_files:
                    extracted = _extract_uploaded_text(file)
                    if extracted is None:
                        st.warning(f"Unsupported file type: {file.name}")
                    else:
                        extracted_parts.append(extracted)
            full_text = "".join(extracted_parts)

            if full_text.strip():
                st.session_state["context_text"] = full_text[:3000]  # Limit for Gemini token safety
                st.success("Text extracted. You can now ask questions.")
            else:
                # Nothing usable came out of the files. Drop any context left
                # from an earlier upload so questions are not answered from
                # stale text, and tell the user instead of claiming success.
                if "context_text" in st.session_state:
                    del st.session_state["context_text"]
                st.warning("No text could be extracted from the uploaded files.")

    # Ask questions
    if "context_text" in st.session_state:
        user_question = st.text_input("Ask a question based on the uploaded document:")
        if user_question:
            with st.spinner("Thinking..."):
                try:
                    chain = get_conversational_chain()
                    # ``Document`` here is the langchain_core class (the last
                    # import of that name in this module); the "stuff" chain
                    # takes its context as a list of such documents.
                    doc = Document(page_content=st.session_state["context_text"])
                    # ``invoke`` replaces the deprecated direct call ``chain(...)``.
                    response = chain.invoke(
                        {
                            "input_documents": [doc],
                            "question": user_question,
                        },
                        return_only_outputs=True,
                    )
                    st.markdown(f"**Gemini says:**\n\n{response['output_text']}")
                except Exception as e:
                    # UI boundary: surface any model or chain failure to the user.
                    st.error(f"Error from Gemini: {e}")


if __name__ == "__main__":
    main()