import streamlit as st import os import tempfile from typing import List, Optional import pickle # Core libraries from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline from langchain.llms import HuggingFacePipeline from langchain.embeddings import HuggingFaceEmbeddings from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.schema import Document from langchain import PromptTemplate from langchain.chains import RetrievalQA from langchain.vectorstores import FAISS # Document loaders from langchain.document_loaders import PyPDFLoader # Configure Streamlit page st.set_page_config( page_title="PDF RAG System", page_icon="📚", layout="wide", initial_sidebar_state="expanded" ) # Custom CSS for better styling st.markdown(""" """, unsafe_allow_html=True) # Initialize session state if 'qa_chain' not in st.session_state: st.session_state.qa_chain = None if 'vectorstore' not in st.session_state: st.session_state.vectorstore = None if 'documents_processed' not in st.session_state: st.session_state.documents_processed = False if 'chat_history' not in st.session_state: st.session_state.chat_history = [] @st.cache_resource def setup_llm(model_name="google/flan-t5-small"): """Setup the language model for text generation""" with st.spinner("🤖 Loading language model..."): try: tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForSeq2SeqLM.from_pretrained(model_name) pipe = pipeline( "text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=300, temperature=0.3, do_sample=True, device=-1 ) llm = HuggingFacePipeline(pipeline=pipe) return llm except Exception as e: st.error(f"Error loading model: {e}") return None @st.cache_resource def setup_embeddings(model_name="all-MiniLM-L6-v2"): """Setup the embedding model for vector generation""" with st.spinner("🔢 Loading embedding model..."): try: embeddings = HuggingFaceEmbeddings(model_name=model_name) return embeddings except Exception as e: st.error(f"Error loading embeddings: {e}") return None def process_uploaded_files(uploaded_files, embeddings): """Process uploaded PDF files and create FAISS vector store""" if not uploaded_files: return None, [] documents = [] # Process each uploaded file for uploaded_file in uploaded_files: try: # Create temporary file with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file: tmp_file.write(uploaded_file.read()) tmp_file_path = tmp_file.name # Load PDF loader = PyPDFLoader(tmp_file_path) docs = loader.load() # Add file name to metadata for doc in docs: doc.metadata['source_file'] = uploaded_file.name documents.extend(docs) # Clean up temporary file os.unlink(tmp_file_path) st.success(f"✅ Processed: {uploaded_file.name} ({len(docs)} pages)") except Exception as e: st.error(f"❌ Error processing {uploaded_file.name}: {e}") if not documents: return None, [] # Split documents into chunks text_splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=200, length_function=len, separators=["\n\n", "\n", " ", ""] ) text_chunks = text_splitter.split_documents(documents) # Add metadata to chunks for i, text in enumerate(text_chunks): text.metadata.update({ "chunk_id": i, "chunk_size": len(text.page_content) }) st.info(f"✂️ Created {len(text_chunks)} text chunks") # Create FAISS vector store try: vectorstore = FAISS.from_documents(text_chunks, embeddings) st.success(f"✅ Successfully created vector database with {len(text_chunks)} chunks!") return vectorstore, text_chunks except Exception as e: st.error(f"❌ Error creating vector database: {e}") return None, [] def create_qa_chain(llm, vectorstore, k=5): """Create a question-answering chain with retrieval""" if not vectorstore or not llm: return None prompt_template = """Use the following context to answer the question. If you cannot find the answer in the context, say "I cannot find this information in the provided documents." Context: {context} Question: {question} Answer:""" PROMPT = PromptTemplate( template=prompt_template, input_variables=["context", "question"] ) try: qa_chain = RetrievalQA.from_chain_type( llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever(search_kwargs={"k": k}), chain_type_kwargs={"prompt": PROMPT}, return_source_documents=True ) return qa_chain except Exception as e: st.error(f"Error creating QA chain: {e}") return None def ask_question(qa_chain, question): """Ask a question and get an answer with sources""" if not qa_chain: return None try: result = qa_chain({"query": question}) response = { "question": question, "answer": result["result"], "source_documents": result.get("source_documents", []) } return response except Exception as e: st.error(f"❌ Error processing question: {e}") return None def search_similar_chunks(vectorstore, query, k=5): """Search for similar chunks without generating an answer""" if not vectorstore: return [] try: results = vectorstore.similarity_search(query, k=k) return results except Exception as e: st.error(f"Error searching: {e}") return [] # Main App Interface def main(): st.markdown('