# -*- coding: utf-8 -*-
"""final_app

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1pG3uDsJzglvQecdTcY76aXa5ObFadRux
"""

# !pip install gradio langchain langchain-community langchain-huggingface langchain-groq faiss-cpu sentence-transformers pypdf

import gradio as gr
import os
import tempfile

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# NOTE(review): emoji in the user-facing strings below were mojibake in the
# source copy; they have been restored to plausible glyphs — confirm against
# the original notebook.

# Groq API Key.
# SECURITY NOTE(review): a secret API key was committed here in plain text.
# It should be rotated and supplied via the GROQ_API_KEY environment variable;
# the literal is kept only as a fallback so existing behavior is unchanged.
GROQ_API_KEY = os.environ.get(
    "GROQ_API_KEY",
    "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",
)
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

# Module-level state shared between the Gradio callbacks.
vectorstore = None          # FAISS index built from the uploaded PDFs
processed_files_list = []   # basenames of the PDFs currently indexed


def process_pdfs(files):
    """Load the uploaded PDFs, chunk them, and build the FAISS vector store.

    Args:
        files: list of Gradio file objects (each exposing a ``.name`` path),
            or None when nothing was uploaded.

    Returns:
        A ``(processing_status, chat_status)`` tuple of strings for the UI.
    """
    global vectorstore, processed_files_list

    if not files:
        return "⚠️ Please upload at least one PDF file", ""

    try:
        all_documents = []
        processed_names = []

        # Load every page of every uploaded PDF.
        for file in files:
            loader = PyPDFLoader(file.name)
            documents = loader.load()
            all_documents.extend(documents)
            processed_names.append(os.path.basename(file.name))

        if not all_documents:
            return "❌ No content extracted from PDFs", ""

        # Split documents into overlapping chunks sized for retrieval.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        splits = text_splitter.split_documents(all_documents)

        # Embed the chunks locally on CPU with a small sentence-transformer.
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
        )

        vectorstore = FAISS.from_documents(splits, embeddings)
        processed_files_list = processed_names

        success_msg = f"✅ Successfully processed {len(files)} document(s)!\n"
        success_msg += f"📊 Created {len(splits)} text chunks for retrieval\n\n"
        success_msg += "📄 Processed files:\n" + "\n".join(
            [f"  • {name}" for name in processed_names]
        )

        return success_msg, "✅ Documents processed! You can now ask questions."

    except Exception as e:
        return f"❌ Error processing documents: {str(e)}", ""


def answer_question(question, chat_history):
    """Answer ``question`` from the indexed documents, appending to the chat.

    Args:
        question: user question string.
        chat_history: list of ``[question, answer]`` pairs (Gradio Chatbot).

    Returns:
        The updated chat history list.
    """
    global vectorstore

    if not vectorstore:
        return chat_history + [[question, "⚠️ Please upload and process PDF documents first!"]]

    if not question or question.strip() == "":
        return chat_history + [[question, "⚠️ Please enter a valid question."]]

    try:
        # Temperature 0 for the most deterministic, factual responses.
        llm = ChatGroq(
            model="llama-3.1-8b-instant",
            temperature=0,
            max_tokens=1024,
            api_key=GROQ_API_KEY,
        )

        # Strict context-only answering: the model must refuse rather than
        # fall back on its own knowledge.
        prompt_template = """You are a helpful assistant that answers questions ONLY based on the provided context from uploaded PDF documents.

CRITICAL INSTRUCTIONS:
- Answer ONLY if the information is present in the context below
- If the context does not contain relevant information to answer the question, you MUST respond with: "I don't know the answer. This information is not available in the uploaded documents."
- DO NOT use any external knowledge or information not present in the context
- DO NOT make assumptions or inferences beyond what is explicitly stated in the context
- If you're unsure whether the context contains the answer, say you don't know

Context from uploaded documents:
{context}

Question: {question}

Answer (only from the context above):"""

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"],
        )

        # NOTE(review): fetch_k only influences MMR retrieval; it is inert
        # with search_type="similarity" but kept for behavior parity.
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(
                search_type="similarity",
                search_kwargs={
                    "k": 5,        # retrieve top 5 most relevant chunks
                    "fetch_k": 20, # candidates fetched before filtering
                },
            ),
            chain_type_kwargs={"prompt": PROMPT},
            return_source_documents=True,
        )

        # .invoke() replaces the deprecated Chain.__call__ usage.
        result = qa_chain.invoke({"query": question})
        answer = result['result']
        source_docs = result.get('source_documents', [])

        # Append source citations unless the model declined to answer.
        if source_docs and "don't know" not in answer.lower():
            answer += "\n\n📚 **Sources found in documents:**"
            unique_sources = set()
            for doc in source_docs[:3]:  # show top 3 sources
                source = doc.metadata.get('source', 'Unknown')
                page = doc.metadata.get('page', 'Unknown')
                source_id = f"{source} (Page {page})"
                if source_id not in unique_sources:
                    unique_sources.add(source_id)
            for source in unique_sources:
                answer += f"\n  • {source}"

        chat_history = chat_history + [[question, answer]]
        return chat_history

    except Exception as e:
        error_msg = f"❌ Error generating answer: {str(e)}"
        return chat_history + [[question, error_msg]]


def clear_data():
    """Reset the module state (index and file list) and clear the UI fields."""
    global vectorstore, processed_files_list
    vectorstore = None
    processed_files_list = []
    return "🗑️ All data cleared. Please upload new documents.", "", []


# Custom CSS for better styling
custom_css = """
#title {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5em;
    font-weight: bold;
    margin-bottom: 10px;
}
#subtitle {
    text-align: center;
    color: #666;
    font-size: 1.2em;
    margin-bottom: 20px;
}
.gradio-container {
    max-width: 1200px !important;
    margin: auto !important;
}
"""

# Create Gradio interface
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    # Header
    # NOTE(review): the original header markup was garbled in this copy; only
    # the subtitle text survived. Confirm the #title element against the
    # original notebook.
    gr.HTML("Upload PDFs and ask questions using AI-powered retrieval")

    with gr.Row():
        # Left column - Document Upload
        with gr.Column(scale=1):
            gr.Markdown("### 📄 Document Upload")
            file_upload = gr.File(
                label="Upload PDF Documents",
                file_types=[".pdf"],
                file_count="multiple",
            )
            process_btn = gr.Button("🚀 Process Documents", variant="primary", size="lg")
            process_output = gr.Textbox(
                label="Processing Status",
                lines=8,
                interactive=False,
            )
            clear_btn = gr.Button("🗑️ Clear All Data", variant="stop")
            gr.Markdown("""
            ---
            ### ℹ️ How to Use
            1. **Upload PDFs** using the file uploader
            2. Click **Process Documents**
            3. **Ask questions** in the chat
            4. Get **AI-powered answers**

            **Features:**
            - 📄 Multiple PDF support
            - 🤖 Powered by Groq LLM
            - 🔍 Semantic search
            - 💾 Chat history
            """)

        # Right column - Chat Interface
        with gr.Column(scale=2):
            gr.Markdown("### 💬 Ask Questions")
            status_text = gr.Textbox(
                label="Status",
                value="⚠️ Upload and process documents to start",
                interactive=False,
            )
            chatbot = gr.Chatbot(
                label="Chat History",
                height=400,
                show_label=True,
            )
            with gr.Row():
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask anything about your documents...",
                    scale=4,
                )
                submit_btn = gr.Button("🔍 Ask", variant="primary", scale=1)
            clear_chat_btn = gr.Button("🧹 Clear Chat")

    # Footer
    gr.HTML("""Powered by Langchain, Groq, and HuggingFace | Built with ❤️ using Gradio""")

    # NOTE(review): the source copy is truncated here — the event wiring
    # (process_btn.click, submit_btn.click, clear handlers) and demo.launch()
    # are not visible and are intentionally not reconstructed.