# (Non-code scrape artifact from the hosting page, preserved as a comment)
# Spaces: Sleeping
# ========================
# 📄 streamlit_app.py
# LangChain + Gemini 1.5 Flash without FAISS
# ========================
import os

import streamlit as st
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# BUGFIX: alias the python-docx class — previously `from docx import Document`
# was silently shadowed by the LangChain `Document` imported below, so DOCX
# parsing called the wrong class.
from docx import Document as DocxDocument
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from PyPDF2 import PdfReader

# ========================
# 1️⃣ Configuration and Setup
# ========================
load_dotenv()  # read a local .env file into the environment, if present
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    st.error("Missing GOOGLE_API_KEY in environment variables.")
    st.stop()  # nothing below can work without an API key

# ========================
# 2️⃣ File Size Limits
# ========================
MAX_TOTAL_SIZE_MB = 5  # combined cap across all uploaded files
MAX_FILE_SIZE_MB = 2   # cap per individual file
def validate_file_sizes(uploaded_files):
    """Return True when every upload fits the per-file and total size caps.

    Emits a Streamlit warning and returns False at the first violation.
    """
    running_total_mb = 0.0
    for upload in uploaded_files:
        megabytes = upload.size / (1024 * 1024)
        if megabytes > MAX_FILE_SIZE_MB:
            st.warning(f"{upload.name} is too large ({megabytes:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
            return False
        running_total_mb += megabytes
    if running_total_mb > MAX_TOTAL_SIZE_MB:
        st.warning(f"Total size of all files is {running_total_mb:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB total.")
        return False
    return True
| # ======================== | |
| # 3️⃣ Text Extraction | |
| # ======================== | |
def get_pdf_text(pdf_docs):
    """Concatenate the extractable text of every page across all given PDFs.

    Pages where PyPDF2 returns no text (e.g. scanned images) are skipped.
    """
    chunks = []
    for pdf in pdf_docs:
        for page in PdfReader(pdf).pages:
            extracted = page.extract_text()
            if extracted:
                chunks.append(extracted)
    return "".join(chunks)
def get_docx_text(docx_file):
    """Extract all paragraph text from a .docx upload, newline-joined.

    BUGFIX: at module level the name ``Document`` is rebound to the LangChain
    ``Document`` class (imported after python-docx), so the original call
    ``Document(docx_file)`` invoked the wrong class and could not parse a
    .docx file. Import the python-docx reader locally under an alias.
    """
    from docx import Document as DocxDocument  # local import dodges the name clash

    doc = DocxDocument(docx_file)
    return "\n".join(para.text for para in doc.paragraphs)
def get_html_text(html_file):
    """Strip markup from an uploaded HTML file and return its visible text."""
    markup = html_file.read()
    return BeautifulSoup(markup, "html.parser").get_text()
| # ======================== | |
| # 4️⃣ LangChain Q&A Chain | |
| # ======================== | |
def get_conversational_chain():
    """Build a 'stuff'-type question-answering chain backed by Gemini 1.5 Flash."""
    template = """
Answer the question as detailed as possible from the provided context. If the answer is not available, say "answer is not available in the context."
Context:
{context}
Question:
{question}
Answer:
"""
    qa_prompt = PromptTemplate(
        template=template,
        input_variables=["context", "question"],
    )
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.3,  # low temperature keeps answers grounded in the context
        google_api_key=GOOGLE_API_KEY,
    )
    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)
| # ======================== | |
| # 5️⃣ Streamlit App | |
| # ======================== | |
def _extract_text(uploaded_files):
    """Pull plain text out of each supported upload; warn on unknown types."""
    collected = []
    for upload in uploaded_files:
        if upload.name.endswith(".pdf"):
            collected.append(get_pdf_text([upload]))
        elif upload.name.endswith(".docx"):
            collected.append(get_docx_text(upload))
        elif upload.name.endswith(".html"):
            collected.append(get_html_text(upload))
        else:
            st.warning(f"Unsupported file type: {upload.name}")
    return "".join(collected)


def main():
    """Streamlit entry point: upload documents, extract text, answer questions."""
    st.set_page_config(page_title="Gemini Q&A Without FAISS")
    st.header("📄 Chat with Uploaded Documents (FAISS-Free Gemini Q&A)")

    # --- Upload and extract ---
    with st.sidebar:
        st.title("Upload Files")
        uploaded_files = st.file_uploader(
            "Upload PDF, DOCX, or HTML files (Max 2MB/file, 5MB total)",
            accept_multiple_files=True,
            type=['pdf', 'docx', 'html'],
        )

    if st.button("Submit & Extract"):
        if not uploaded_files:
            st.warning("Please upload at least one file.")
            return
        if not validate_file_sizes(uploaded_files):
            return
        with st.spinner("Extracting file content..."):
            full_text = _extract_text(uploaded_files)
        # Truncate to stay within Gemini's token budget.
        st.session_state["context_text"] = full_text[:3000]
        st.success("Text extracted. You can now ask questions.")

    # --- Ask questions against the stored context ---
    if "context_text" in st.session_state:
        user_question = st.text_input("Ask a question based on the uploaded document:")
        if user_question:
            with st.spinner("Thinking..."):
                try:
                    chain = get_conversational_chain()
                    # Wrap the extracted context text in a LangChain Document
                    doc = Document(page_content=st.session_state["context_text"])
                    response = chain(
                        {
                            "input_documents": [doc],
                            "question": user_question,
                        },
                        return_only_outputs=True,
                    )
                    st.markdown(f"**Gemini says:**\n\n{response['output_text']}")
                except Exception as e:
                    st.error(f"Error from Gemini: {e}")


if __name__ == "__main__":
    main()