Spaces:
Sleeping
Sleeping
| from dotenv import load_dotenv | |
| import os | |
| import streamlit as st | |
| from PyPDF2 import PdfReader | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI | |
| from langchain_community.vectorstores import FAISS | |
| from langchain.chains.question_answering import load_qa_chain | |
| from langchain.prompts import PromptTemplate | |
| import tempfile | |
# Load environment variables (for example from a local .env file) before any
# key is read.
load_dotenv()

# Read the Gemini API key from the environment instead of embedding it in the
# source file. GEMINI_API_KEY is accepted as an alternative variable name.
# NOTE(review): a literal key used to be committed on this line; treat that key
# as leaked and rotate it.
gemini_api_key = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")

# Mirror the key into GOOGLE_API_KEY only when one was found: assigning None
# into os.environ raises TypeError.
if gemini_api_key:
    os.environ["GOOGLE_API_KEY"] = gemini_api_key
def _collect_page_text(pdf_reader, parts):
    """Append the text of every non-empty page of *pdf_reader* to *parts*."""
    for page in pdf_reader.pages:
        text = page.extract_text()
        if text:
            parts.append(text + "\n")


def get_pdf_text(pdf_files):
    """Extract and concatenate the text of one or more PDF files.

    Args:
        pdf_files: ``None``, a single PDF, or a list/tuple of PDFs. Each PDF
            is either an object with a ``read()`` method (for example a
            Streamlit upload) or a value handed straight to ``PdfReader``,
            such as a file path.

    Returns:
        The text of every page that yielded text, each page followed by a
        newline. Returns ``""`` when *pdf_files* is ``None`` or empty.

    A file that cannot be read is reported with ``st.error`` and skipped;
    text already collected (including earlier pages of that file) is kept.
    """
    if pdf_files is None:
        return ""

    # Accept a single file as well as a collection of files.
    if not isinstance(pdf_files, (list, tuple)):
        pdf_files = [pdf_files]

    parts = []
    for pdf in pdf_files:
        try:
            if hasattr(pdf, 'read'):
                # Uploaded file object: spool it to a temporary .pdf on disk.
                tmp_path = None
                try:
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                        tmp_path = tmp_file.name
                        tmp_file.write(pdf.read())
                    # The handle is closed before parsing so the file can be
                    # re-opened by name on every platform.
                    _collect_page_text(PdfReader(tmp_path), parts)
                finally:
                    # Always remove the temporary file, even when parsing fails.
                    if tmp_path is not None:
                        os.unlink(tmp_path)
            else:
                # Anything else is passed to PdfReader as-is (e.g. a path).
                _collect_page_text(PdfReader(pdf), parts)
        except Exception as e:
            st.error(f"讀取PDF時發生錯誤：{str(e)}")
            continue
    return "".join(parts)
def get_text_chunks(text):
    """Split *text* into chunks for embedding.

    The splitter breaks on newlines and is configured with a chunk size of
    10000 and an overlap of 1000, both measured with ``len``.

    Returns:
        The list produced by ``CharacterTextSplitter.split_text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 10000,
        "chunk_overlap": 1000,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
def get_vector_store(chunks):
    """Embed *chunks*, build a FAISS index from them and save it locally.

    The index is written to the "faiss_index" path.

    Returns:
        ``True`` when the index was built and saved, ``False`` when any step
        raised (the error is shown with ``st.error``).
    """
    try:
        embedder = GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004",
            google_api_key=gemini_api_key,
        )
        FAISS.from_texts(chunks, embedder).save_local("faiss_index")
    except Exception as exc:
        st.error(f"創建向量存儲時發生錯誤：{str(exc)}")
        return False
    return True
def get_conversational_chain():
    """Build the question-answering chain used to answer from retrieved context.

    Returns:
        The chain created by ``load_qa_chain`` with ``chain_type="stuff"``,
        a Gemini chat model and a prompt that takes ``context`` and
        ``question``.
    """
    # Kept flush-left on purpose: this text is sent to the model as written.
    template_text = """
Answer the question as detailed as possible from the provided context. Make sure to provide all the details.
If you need more details to perfectly answer the question, then ask for more details that you think need to be known.
If the answer is not in the provided context, just say "answer is not available in your provided context". Don't provide the wrong answer.
Context:\n {context}\n
Question: \n{question}\n
Answer:
"""
    llm_settings = {
        "model": "gemini-2.0-flash-exp",
        "google_api_key": gemini_api_key,
        "temperature": 0.3,
        "max_tokens": 8192,
        "top_p": 0.8,
        "top_k": 40,
    }
    llm = ChatGoogleGenerativeAI(**llm_settings)
    qa_prompt = PromptTemplate(
        template=template_text,
        input_variables=['context', 'question'],
    )
    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)
def handle_user_input(question):
    """Answer *question* from the saved FAISS index and render the reply.

    Shows a warning when the "faiss_index" path does not exist, a notice when
    the similarity search returns nothing, and an error message when any step
    raises. Nothing is returned; all output goes to the Streamlit page.
    """
    try:
        index_exists = os.path.exists("faiss_index")
        if not index_exists:
            st.warning("Please upload and process PDF files first!")
            return

        # Same embedding model name that get_vector_store uses to build the index.
        embedder = GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004",
            google_api_key=gemini_api_key,
        )
        # NOTE(review): allow_dangerous_deserialization opts in to loading
        # pickled data; presumably acceptable because the index is written by
        # this app itself - confirm nothing else can write to "faiss_index".
        index = FAISS.load_local(
            "faiss_index",
            embeddings=embedder,
            allow_dangerous_deserialization=True,
        )

        # Retrieve the six closest chunks to use as context.
        matches = index.similarity_search(question, k=6)
        if not matches:
            st.write("No relevant information found in the uploaded documents.")
            return

        qa_chain = get_conversational_chain()
        chain_inputs = {"input_documents": matches, "question": question}
        result = qa_chain(chain_inputs, return_only_outputs=True)

        st.write("**Reply (Flash 2.0):**")
        st.write(result["output_text"])
    except Exception as exc:
        st.error(f"Error processing question: {str(exc)}")
def _rerun_app():
    """Restart the script run with whichever rerun API this Streamlit has."""
    # st.experimental_rerun is absent from newer Streamlit releases; prefer
    # st.rerun and fall back to the old name on older installs.
    rerun = getattr(st, "rerun", None) or getattr(st, "experimental_rerun")
    rerun()


def _process_uploaded_pdfs(pdf_docs):
    """Extract, chunk and index the uploaded PDFs, reporting progress in the UI."""
    with st.spinner("Processing PDFs with Flash 2.0..."):
        progress_bar = st.progress(0)

        # Extract text from all PDFs
        progress_bar.progress(25)
        raw_text = get_pdf_text(pdf_docs)
        if not raw_text.strip():
            st.error("No text could be extracted from the PDF files.")
            # Leaves only this helper, so the caller still renders the rest
            # of the page.
            return

        # Split text into chunks
        progress_bar.progress(50)
        text_chunks = get_text_chunks(raw_text)

        # Create vector store
        progress_bar.progress(75)
        success = get_vector_store(text_chunks)
        progress_bar.progress(100)

        if success:
            st.success("✅ PDFs processed successfully! You can now ask questions.")
            st.info(f"📊 Processed {len(text_chunks)} text chunks from your documents.")
        else:
            st.error("Failed to process PDFs. Please try again.")


def _clear_processed_data():
    """Delete the saved "faiss_index" directory and restart the script run."""
    import shutil

    try:
        shutil.rmtree("faiss_index")
    except Exception as e:
        st.error(f"Error clearing data: {str(e)}")
        return
    st.success("Cleared processed data!")
    # Called outside the try block so a failure to rerun is never reported as
    # a failure to clear the data.
    _rerun_app()


def _render_sidebar():
    """Render the usage notes, feature lists and the clear-data button."""
    with st.sidebar:
        st.markdown("### ℹ️ How to use:")
        st.markdown("""
        1. **Upload PDFs**: Click 'Choose PDF files' and select one or more PDF files
        2. **Process**: Click 'Process PDFs' to analyze your documents
        3. **Ask Questions**: Type your questions in the search box
        4. **Get Answers**: Flash 2.0 will provide fast, accurate answers based on your documents
        """)
        st.markdown("### ⚡ Flash 2.0 Features:")
        st.markdown("""
        - ⚡ **Ultra-fast responses** - 2x faster than Gemini Pro
        - 🧠 **Enhanced reasoning** - Better understanding of complex queries
        - 📈 **Improved accuracy** - More precise answers from documents
        - 🔄 **Better context handling** - Processes more relevant information
        - 💰 **Cost efficient** - Lower API costs per query
        """)
        st.markdown("### 🔧 Technical Features:")
        st.markdown("""
        - ✅ Multiple PDF support
        - 🤖 AI-powered Q&A with Flash 2.0
        - 🔍 Advanced semantic search
        - 📊 Optimized text chunking
        - 🎯 Improved embedding model (text-embedding-004)
        """)
        # Offer the clear button only while a saved index exists.
        if os.path.exists("faiss_index"):
            if st.button("🗑️ Clear Processed Data"):
                _clear_processed_data()


def main():
    """Render the Streamlit page: question box, PDF upload panel and sidebar.

    The left column takes a question and answers it through
    ``handle_user_input``; the right column uploads PDFs and, on request,
    processes them into the saved index; the sidebar shows help text and a
    button that clears the saved index.
    """
    st.set_page_config(
        page_title="Chat with Multiple PDFs - Flash 2.0",
        page_icon="⚡",
        layout="wide"
    )
    st.header("⚡ Chat With Multiple PDFs using Flash 2.0")
    st.markdown("Upload your PDF files and ask questions about their content using Google's latest Flash 2.0 model!")

    # Model info badge
    st.markdown("""
    <div style="background-color: #e8f4f8; padding: 10px; border-radius: 5px; margin-bottom: 20px;">
    <strong>🚀 Powered by Flash 2.0</strong> - Google's fastest and most efficient model with enhanced reasoning capabilities
    </div>
    """, unsafe_allow_html=True)

    # Two columns: questions on the left, uploads on the right.
    col1, col2 = st.columns([2, 1])

    with col1:
        user_question = st.text_input(
            "🔍 Ask a question about your PDF files:",
            placeholder="e.g., What is the main topic of the document?"
        )
        if user_question:
            with st.spinner("Flash 2.0 is processing your question..."):
                handle_user_input(user_question)

    with col2:
        st.markdown("### 📄 Upload PDFs")
        pdf_docs = st.file_uploader(
            "Choose PDF files",
            accept_multiple_files=True,
            type="pdf"
        )
        if pdf_docs:
            st.success(f"✅ {len(pdf_docs)} PDF file(s) uploaded")
            if st.button("🔄 Process PDFs", type="primary"):
                _process_uploaded_pdfs(pdf_docs)

    # The sidebar is rendered on every run, including after a failed
    # processing attempt.
    _render_sidebar()
# Start the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()