# (Non-code scrape artifact from the hosting page, preserved as a comment)
# Spaces: Sleeping
# ========================
# 📄 streamlit_app.py
# LangChain + Gemini 1.5 Flash without FAISS
# ========================
import os

import streamlit as st
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# BUGFIX: alias the python-docx class — previously `from docx import Document`
# was silently shadowed by the LangChain `Document` imported below, so DOCX
# parsing called the wrong class.
from docx import Document as DocxDocument
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from PyPDF2 import PdfReader

# ========================
# 1️⃣ Configuration and Setup
# ========================
load_dotenv()  # read a local .env file into the environment, if present
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    st.error("Missing GOOGLE_API_KEY in environment variables.")
    st.stop()  # nothing below can work without an API key

# ========================
# 2️⃣ File Size Limits
# ========================
MAX_TOTAL_SIZE_MB = 5  # combined cap across all uploaded files
MAX_FILE_SIZE_MB = 2   # cap per individual file
def validate_file_sizes(uploaded_files):
    """Return True when every upload fits the per-file and total size caps.

    Emits a Streamlit warning and returns False at the first violation.
    """
    running_total_mb = 0.0
    for upload in uploaded_files:
        megabytes = upload.size / (1024 * 1024)
        if megabytes > MAX_FILE_SIZE_MB:
            st.warning(f"{upload.name} is too large ({megabytes:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
            return False
        running_total_mb += megabytes
    if running_total_mb > MAX_TOTAL_SIZE_MB:
        st.warning(f"Total size of all files is {running_total_mb:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB total.")
        return False
    return True
| # ======================== | |
| # 3️⃣ Text Extraction | |
| # ======================== | |
def get_pdf_text(pdf_docs):
    """Concatenate the extractable text of every page across all given PDFs.

    Pages where PyPDF2 returns no text (e.g. scanned images) are skipped.
    """
    chunks = []
    for pdf in pdf_docs:
        for page in PdfReader(pdf).pages:
            extracted = page.extract_text()
            if extracted:
                chunks.append(extracted)
    return "".join(chunks)
def get_docx_text(docx_file):
    """Extract all paragraph text from a .docx upload, newline-joined.

    BUGFIX: at module level the name ``Document`` is rebound to the LangChain
    ``Document`` class (imported after python-docx), so the original call
    ``Document(docx_file)`` invoked the wrong class and could not parse a
    .docx file. Import the python-docx reader locally under an alias.
    """
    from docx import Document as DocxDocument  # local import dodges the name clash

    doc = DocxDocument(docx_file)
    return "\n".join(para.text for para in doc.paragraphs)
def get_html_text(html_file):
    """Strip markup from an uploaded HTML file and return its visible text."""
    markup = html_file.read()
    return BeautifulSoup(markup, "html.parser").get_text()
| # ======================== | |
| # 4️⃣ LangChain Q&A Chain | |
| # ======================== | |
def get_conversational_chain():
    """Build a 'stuff'-type question-answering chain backed by Gemini 1.5 Flash."""
    template = """
Answer the question as detailed as possible from the provided context. If the answer is not available, say "answer is not available in the context."
Context:
{context}
Question:
{question}
Answer:
"""
    qa_prompt = PromptTemplate(
        template=template,
        input_variables=["context", "question"],
    )
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.3,  # low temperature keeps answers grounded in the context
        google_api_key=GOOGLE_API_KEY,
    )
    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)
| # ======================== | |
| # 5️⃣ Streamlit App | |
| # ======================== | |
def _extract_text(uploaded_files):
    """Pull plain text out of each supported upload; warn on unknown types."""
    collected = []
    for upload in uploaded_files:
        if upload.name.endswith(".pdf"):
            collected.append(get_pdf_text([upload]))
        elif upload.name.endswith(".docx"):
            collected.append(get_docx_text(upload))
        elif upload.name.endswith(".html"):
            collected.append(get_html_text(upload))
        else:
            st.warning(f"Unsupported file type: {upload.name}")
    return "".join(collected)


def main():
    """Streamlit entry point: upload documents, extract text, answer questions."""
    st.set_page_config(page_title="Gemini Q&A Without FAISS")
    st.header("📄 Chat with Uploaded Documents (FAISS-Free Gemini Q&A)")

    # --- Upload and extract ---
    with st.sidebar:
        st.title("Upload Files")
        uploaded_files = st.file_uploader(
            "Upload PDF, DOCX, or HTML files (Max 2MB/file, 5MB total)",
            accept_multiple_files=True,
            type=['pdf', 'docx', 'html'],
        )

    if st.button("Submit & Extract"):
        if not uploaded_files:
            st.warning("Please upload at least one file.")
            return
        if not validate_file_sizes(uploaded_files):
            return
        with st.spinner("Extracting file content..."):
            full_text = _extract_text(uploaded_files)
        # Truncate to stay within Gemini's token budget.
        st.session_state["context_text"] = full_text[:3000]
        st.success("Text extracted. You can now ask questions.")

    # --- Ask questions against the stored context ---
    if "context_text" in st.session_state:
        user_question = st.text_input("Ask a question based on the uploaded document:")
        if user_question:
            with st.spinner("Thinking..."):
                try:
                    chain = get_conversational_chain()
                    # Wrap the extracted context text in a LangChain Document
                    doc = Document(page_content=st.session_state["context_text"])
                    response = chain(
                        {
                            "input_documents": [doc],
                            "question": user_question,
                        },
                        return_only_outputs=True,
                    )
                    st.markdown(f"**Gemini says:**\n\n{response['output_text']}")
                except Exception as e:
                    st.error(f"Error from Gemini: {e}")


if __name__ == "__main__":
    main()