"""Gradio front end for a small RAG pipeline over PDFs pulled from Google Drive.

On start-up the script downloads every link in ``links_to_process`` into
``output_dir``, loads all PDFs found there, chunks and embeds them into an
in-memory FAISS index, and wires a Groq-hosted chat model to that index.
"""

import os
import time

import gdown
import gradio as gr

# Modern Imports
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# ==========================================
# 1. SETUP & KEYS
# ==========================================
# The key must already be present in the environment. Re-assigning
# os.environ from os.getenv() would raise an opaque TypeError when it is
# missing, so fail early with an actionable message instead.
if not os.getenv("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it in the environment before "
        "starting the app."
    )

# --- UPDATE THIS LIST WITH ALL YOUR LINKS ---
links_to_process = [
    "https://drive.google.com/file/d/1rb7AeJZrDNR-bq8Q9V4IvtzYZsDOvDH0/view?usp=sharing",
    "https://drive.google.com/file/d/16PcJo_JaQHh1bx01lCAkc4QwQ6YnLb-K/view?usp=sharing",
    # "https://drive.google.com/drive/folders/ANOTHER_FOLDER_ID"
]

output_dir = 'knowledge_base'
os.makedirs(output_dir, exist_ok=True)


# ==========================================
# 2. IMPROVED DOWNLOAD LOGIC
# ==========================================
def build_vector_db(links):
    """Download the given Drive links and index every PDF found locally.

    Args:
        links: Iterable of Google Drive URLs. URLs containing ``/folders/``
            are fetched as folders; everything else is fetched as a single
            file.

    Returns:
        A FAISS vector store built from the chunked PDF contents.

    Raises:
        ValueError: If no PDF under ``output_dir`` could be loaded.
    """
    print(f"📥 Starting synchronization for {len(links)} sources...")

    for link in links:
        # Best effort: one bad link must not abort the whole sync.
        try:
            if "/folders/" in link:
                print(f"📂 Syncing Folder: {link}")
                gdown.download_folder(
                    url=link,
                    output=output_dir,
                    quiet=True,
                    use_cookies=False,
                )
            else:
                print(f"📄 Syncing Individual File: {link}")
                # A trailing path separator tells gdown to save *into* the
                # folder; os.path.join(..., "") yields the platform's own
                # separator. fuzzy=True lets gdown extract the file id from
                # ".../file/d/<id>/view" share links.
                gdown.download(
                    url=link,
                    output=os.path.join(output_dir, ""),
                    quiet=True,
                    fuzzy=True,
                )
            time.sleep(1)  # Small pause to respect Drive rate limits
        except Exception as e:
            print(f"⚠️ Skip Link: Could not download {link}. Error: {e}")

    all_docs = []
    # Use os.walk to find PDFs even inside subfolders downloaded by
    # download_folder.
    for root, _dirs, files in os.walk(output_dir):
        for filename in files:
            if not filename.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(root, filename)
            try:
                loader = PyPDFLoader(file_path)
                all_docs.extend(loader.load())
            except Exception as e:
                # Skip unreadable PDFs but keep indexing the rest.
                print(f"❌ Error loading {filename}: {e}")

    if not all_docs:
        raise ValueError(
            "No PDF documents found! Ensure links are set to 'Anyone with the link'."
        )

    # Chunking & Embeddings
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    chunks = text_splitter.split_documents(all_docs)

    print(f"🧠 Creating embeddings for {len(chunks)} text chunks...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_db = FAISS.from_documents(chunks, embeddings)

    print("✅ Multi-Source Vector Database Created Successfully!")
    return vector_db


# Initialize
vector_store = build_vector_db(links_to_process)
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# ==========================================
# 3. MODERN RAG CHAIN
# ==========================================
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)

# NOTE(review): line breaks inside this template were reconstructed from a
# whitespace-collapsed source — confirm against the intended layout.
template = """Answer the question based ONLY on the following context:
{context}

Question: {question}

Helpful Answer:"""
prompt = ChatPromptTemplate.from_template(template)


def _format_docs(docs):
    """Join retrieved documents' text so the prompt receives plain prose."""
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# ==========================================
# 4. PROFESSIONAL FRONTEND (GRADIO BLOCKS)
# ==========================================
custom_css = """
#main-container { max-width: 900px; margin: auto; padding: 20px; }
.header-text { text-align: center; color: #1e293b; margin-bottom: 2px; }
.report-box {
    background-color: #ffffff;
    border-radius: 8px;
    border: 1px solid #e2e8f0;
    padding: 15px;
    min-height: 200px;
}
"""


def process_query(query):
    """Run the RAG chain for *query* and return Markdown for display.

    Returns a prompt-to-type note for empty or missing input, the chain's
    answer on success, or a Markdown-formatted error message if the chain
    raises.
    """
    if not query or not query.strip():
        return "### ⚠️ System Note\n*Please enter a strategic inquiry to begin analysis.*"
    try:
        return rag_chain.invoke(query)
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing.
        return f"### ❌ Error\nAn error occurred: {str(e)}"


with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo"), css=custom_css) as demo:
    with gr.Column(elem_id="main-container"):
        gr.Markdown("# 🏛️ Enterprise Knowledge Engine", elem_classes="header-text")
        gr.Markdown("\n\nMulti-Source Document Synthesis via Groq & FAISS\n\n")
        # NOTE(review): this literal contained only whitespace in the source
        # as received; any markup (e.g. a divider) may have been lost — confirm.
        gr.HTML("\n")

        user_input = gr.Textbox(
            label="Strategic Inquiry",
            placeholder="Ask a question about the collected knowledge base...",
            lines=3,
        )

        with gr.Row():
            submit_btn = gr.Button("ANALYZE DATA", variant="primary", scale=2)
            clear_btn = gr.ClearButton([user_input], value="RESET DASHBOARD", scale=1)

        gr.Markdown("### 📋 Intelligence Report")
        with gr.Column(elem_classes="report-box"):
            output_display = gr.Markdown(value="_Awaiting input..._")

    # Both the button and pressing Enter in the textbox trigger the query.
    submit_btn.click(fn=process_query, inputs=user_input, outputs=output_display)
    user_input.submit(fn=process_query, inputs=user_input, outputs=output_display)

if __name__ == "__main__":
    demo.launch(share=True)