Spaces:
Sleeping
Sleeping
| import os | |
| import gdown | |
| import time | |
| import gradio as gr | |
| # Modern Imports | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_groq import ChatGroq | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain_core.runnables import RunnablePassthrough | |
| from langchain_core.output_parsers import StrOutputParser | |
| # ========================================== | |
| # 1. SETUP & KEYS | |
| # ========================================== | |
# The Groq API key is taken from the environment. Fail fast with a clear
# message when it is missing: assigning the result of os.getenv() back into
# os.environ is a no-op when the key exists and raises an opaque
# "TypeError: str expected, not NoneType" when it does not.
if not os.getenv("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set. Define it as an environment variable "
        "(or Space secret) before starting the app."
    )

# --- UPDATE THIS LIST WITH ALL YOUR LINKS ---
# Each entry is a Google Drive link; links containing "/folders/" are
# downloaded as folders, everything else as a single file.
links_to_process = [
    "https://drive.google.com/file/d/1rb7AeJZrDNR-bq8Q9V4IvtzYZsDOvDH0/view?usp=sharing",
    "https://drive.google.com/file/d/16PcJo_JaQHh1bx01lCAkc4QwQ6YnLb-K/view?usp=sharing",
    # "https://drive.google.com/drive/folders/ANOTHER_FOLDER_ID"
]

# Local directory that downloaded documents are written to.
output_dir = 'knowledge_base'
# exist_ok avoids the check-then-create race of a separate os.path.exists().
os.makedirs(output_dir, exist_ok=True)
| # ========================================== | |
| # 2. IMPROVED DOWNLOAD LOGIC | |
| # ========================================== | |
def _download_sources(links):
    """Download every Drive link in *links* into ``output_dir``, skipping failures."""
    for link in links:
        try:
            if "/folders/" in link:
                print(f"π Syncing Folder: {link}")
                gdown.download_folder(url=link, output=output_dir, quiet=True, use_cookies=False)
            else:
                print(f"π Syncing Individual File: {link}")
                # The trailing separator tells gdown to save into the folder
                # under the file's own name. fuzzy=True lets gdown extract the
                # file id from ".../file/d/<id>/view?usp=sharing" share links;
                # without it the viewer page is fetched instead of the file.
                gdown.download(
                    url=link,
                    output=os.path.join(output_dir, ""),
                    quiet=True,
                    fuzzy=True,
                )
            time.sleep(1)  # Small pause to respect Drive rate limits
        except Exception as e:
            # Best effort: one bad link must not abort the whole sync.
            print(f"β οΈ Skip Link: Could not download {link}. Error: {e}")


def _load_pdfs(directory):
    """Load every PDF found anywhere under *directory* and return the documents."""
    docs = []
    # os.walk also finds PDFs inside subfolders created by download_folder.
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            # Case-insensitive match so files named "*.PDF" are not skipped.
            if not filename.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(root, filename)
            try:
                docs.extend(PyPDFLoader(file_path).load())
            except Exception as e:
                # A single unreadable PDF should not discard the others.
                print(f"β Error loading {filename}: {e}")
    return docs


def build_vector_db(links):
    """Download the linked documents and index them in a FAISS vector store.

    Args:
        links: Iterable of Google Drive file or folder links (must support
            ``len()``). Links that fail to download are reported and skipped.

    Returns:
        A FAISS vector store built from the chunked text of every PDF found
        under the module-level ``output_dir``.

    Raises:
        ValueError: If no PDF content could be loaded after the download step.
    """
    print(f"π₯ Starting synchronization for {len(links)} sources...")
    _download_sources(links)

    all_docs = _load_pdfs(output_dir)
    if not all_docs:
        raise ValueError("No PDF documents found! Ensure links are set to 'Anyone with the link'.")

    # Chunking & Embeddings
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    chunks = text_splitter.split_documents(all_docs)
    print(f"π§  Creating embeddings for {len(chunks)} text chunks...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_db = FAISS.from_documents(chunks, embeddings)
    print("β Multi-Source Vector Database Created Successfully!")
    return vector_db
# Initialize: build the index once at startup and expose a retriever that
# returns the 3 best-matching chunks per query.
vector_store = build_vector_db(links_to_process)
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# ==========================================
# 3. MODERN RAG CHAIN
# ==========================================
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)

template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
Helpful Answer:"""
prompt = ChatPromptTemplate.from_template(template)


def _format_docs(docs):
    """Join the text of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever yields Document objects; piping them through _format_docs
# puts only their text into {context} instead of the repr of a Document list
# (which would include metadata and escaped newlines).
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
| # ========================================== | |
| # 4. PROFESSIONAL FRONTEND (GRADIO BLOCKS) | |
| # ========================================== | |
# Stylesheet for the UI: #main-container is targeted by element id,
# .header-text and .report-box by CSS class.
custom_css = """
#main-container { max-width: 900px; margin: auto; padding: 20px; }
.header-text { text-align: center; color: #1e293b; margin-bottom: 2px; }
.report-box { background-color: #ffffff; border-radius: 8px; border: 1px solid #e2e8f0; padding: 15px; min-height: 200px; }
"""
def process_query(query):
    """Run *query* through the RAG chain and return Markdown for display.

    Args:
        query: The user's question. ``None`` and whitespace-only strings are
            treated as "no input".

    Returns:
        The chain's answer, a note asking for input when the query is blank,
        or a formatted error message if the chain raises.
    """
    # Guard None as well as blank text: calling .strip() on None would raise
    # outside the try block and surface as an unhandled handler error.
    if query is None or not query.strip():
        return "### β οΈ System Note\n*Please enter a strategic inquiry to begin analysis.*"
    try:
        return rag_chain.invoke(query)
    except Exception as e:
        # UI boundary: report the failure in the output panel rather than
        # letting the event handler crash.
        return f"### β Error\nAn error occurred: {str(e)}"
# Build the single-page UI: a question box, two action buttons and a Markdown
# report area, all inside one centered column styled by custom_css.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo"), css=custom_css) as demo:
    with gr.Column(elem_id="main-container"):
        gr.Markdown("# ποΈ Enterprise Knowledge Engine", elem_classes="header-text")
        gr.Markdown("<p style='text-align: center;'>Multi-Source Document Synthesis via Groq & FAISS</p>")
        gr.HTML("<hr>")
        user_input = gr.Textbox(label="Strategic Inquiry", placeholder="Ask a question about the collected knowledge base...", lines=3)
        with gr.Row():
            submit_btn = gr.Button("ANALYZE DATA", variant="primary", scale=2)
            # The reset button clears only the input box, not the report.
            clear_btn = gr.ClearButton([user_input], value="RESET DASHBOARD", scale=1)
        gr.Markdown("### π Intelligence Report")
        with gr.Column(elem_classes="report-box"):
            output_display = gr.Markdown(value="_Awaiting input..._")
    # Clicking the button and submitting from the textbox both call
    # process_query and write its result to the report area.
    submit_btn.click(fn=process_query, inputs=user_input, outputs=output_display)
    user_input.submit(fn=process_query, inputs=user_input, outputs=output_display)

# share=True asks Gradio for a public share link in addition to the local server.
demo.launch(share=True)