# abid-ai's picture
# Update app.py
# b7a10b9 verified
import os
import gdown
import time
import gradio as gr
# Modern Imports
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
# ==========================================
# 1. SETUP & KEYS
# ==========================================
# Fail fast with a readable message when the Groq key is missing. Assigning
# the variable back to itself is a no-op when it is set and raises an opaque
# TypeError when it is not (os.environ only accepts str values).
if not os.getenv("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set. Define it as an environment variable "
        "(or as a secret of the hosting platform) before starting the app."
    )

# --- UPDATE THIS LIST WITH ALL YOUR LINKS ---
# Google Drive share links; links containing "/folders/" are synced as folders.
links_to_process = [
    "https://drive.google.com/file/d/1rb7AeJZrDNR-bq8Q9V4IvtzYZsDOvDH0/view?usp=sharing",
    "https://drive.google.com/file/d/16PcJo_JaQHh1bx01lCAkc4QwQ6YnLb-K/view?usp=sharing",
    # "https://drive.google.com/drive/folders/ANOTHER_FOLDER_ID",
]

# Local directory that receives every downloaded document.
output_dir = 'knowledge_base'
os.makedirs(output_dir, exist_ok=True)
# ==========================================
# 2. IMPROVED DOWNLOAD LOGIC
# ==========================================
def _sync_sources(links):
    """Download every Drive link into ``output_dir``, skipping links that fail."""
    for link in links:
        try:
            if "/folders/" in link:
                print(f"📂 Syncing Folder: {link}")
                gdown.download_folder(url=link, output=output_dir, quiet=True, use_cookies=False)
            else:
                print(f"📄 Syncing Individual File: {link}")
                # The trailing separator makes gdown treat the target as a
                # directory. fuzzy=True lets it extract the file id from
                # ".../file/d/<id>/view" share URLs instead of fetching the
                # viewer page itself.
                saved_path = gdown.download(
                    url=link,
                    output=os.path.join(output_dir, ""),
                    quiet=True,
                    fuzzy=True,
                )
                # NOTE(review): gdown may signal failure by returning None
                # rather than raising, depending on the installed version.
                if saved_path is None:
                    print(f"⚠️ Skip Link: Could not download {link}. Error: gdown returned no file")
                    continue
            time.sleep(1)  # Small pause to respect Drive rate limits
        except Exception as e:
            # Best-effort sync: one bad link must not abort the whole build.
            print(f"⚠️ Skip Link: Could not download {link}. Error: {e}")


def _load_pdf_pages(directory):
    """Load every PDF found under *directory* (recursively) into one list."""
    pages = []
    # os.walk also reaches subfolders created by gdown.download_folder.
    for root, dirs, files in os.walk(directory):
        dirs.sort()  # in-place sort keeps the traversal order reproducible
        for filename in sorted(files):
            # Case-insensitive match so "REPORT.PDF" is not silently ignored.
            if not filename.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(root, filename)
            try:
                pages.extend(PyPDFLoader(file_path).load())
            except Exception as e:
                # A single unreadable PDF should not discard the others.
                print(f"❌ Error loading {filename}: {e}")
    return pages


def build_vector_db(links):
    """Download the sources in *links*, index their PDFs and return a FAISS store.

    Args:
        links: Sized collection of Google Drive share URLs. URLs containing
            ``/folders/`` are synced with ``gdown.download_folder``; all
            others are downloaded as single files.

    Returns:
        The FAISS vector store built from the chunked PDF documents.

    Raises:
        ValueError: If no PDF content could be loaded from ``output_dir``.
    """
    print(f"📥 Starting synchronization for {len(links)} sources...")
    _sync_sources(links)

    all_docs = _load_pdf_pages(output_dir)
    if not all_docs:
        raise ValueError("No PDF documents found! Ensure links are set to 'Anyone with the link'.")

    # Chunking & Embeddings
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    chunks = text_splitter.split_documents(all_docs)
    print(f"🧠 Creating embeddings for {len(chunks)} text chunks...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_db = FAISS.from_documents(chunks, embeddings)
    print("✅ Multi-Source Vector Database Created Successfully!")
    return vector_db
# Build the knowledge base once at startup, then expose a retriever over it
# that is configured with k=3.
vector_store = build_vector_db(links=links_to_process)
retriever = vector_store.as_retriever(search_kwargs=dict(k=3))
# ==========================================
# 3. MODERN RAG CHAIN
# ==========================================
# temperature=0 is requested from the chat model for every call.
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)

template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
Helpful Answer:"""
prompt = ChatPromptTemplate.from_template(template)


def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt would receive the Python repr of the
    document list (metadata included) instead of the passages themselves.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The incoming query feeds both branches: it is sent to the retriever to
# produce the context and passed through unchanged as the question.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
# ==========================================
# 4. PROFESSIONAL FRONTEND (GRADIO BLOCKS)
# ==========================================
# Stylesheet handed to gr.Blocks: one rule per line, wrapped in a leading and
# a trailing newline.
custom_css = (
    "\n"
    "#main-container { max-width: 900px; margin: auto; padding: 20px; }\n"
    ".header-text { text-align: center; color: #1e293b; margin-bottom: 2px; }\n"
    ".report-box { background-color: #ffffff; border-radius: 8px; border: 1px solid #e2e8f0; padding: 15px; min-height: 200px; }\n"
)
def process_query(query):
    """Run *query* through the RAG chain and return Markdown for the report panel.

    Args:
        query: Text from the inquiry textbox; may be empty, blank or ``None``.

    Returns:
        The chain's answer, a "System Note" hint when no usable input was
        given, or an "Error" message describing the exception that was raised.
    """
    # Guard against None as well as empty / whitespace-only input, so a
    # missing value yields the hint instead of an AttributeError.
    if not query or not query.strip():
        return "### ⚠️ System Note\n*Please enter a strategic inquiry to begin analysis.*"
    try:
        return rag_chain.invoke(query)
    except Exception as e:
        # UI boundary: report the failure in the panel instead of raising.
        return f"### ❌ Error\nAn error occurred: {str(e)}"
# Placeholder shown in the report panel before the first query and after a reset.
_IDLE_REPORT = "_Awaiting input..._"

with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo"), css=custom_css) as demo:
    with gr.Column(elem_id="main-container"):
        gr.Markdown("# 🏛️ Enterprise Knowledge Engine", elem_classes="header-text")
        gr.Markdown("<p style='text-align: center;'>Multi-Source Document Synthesis via Groq & FAISS</p>")
        gr.HTML("<hr>")
        user_input = gr.Textbox(label="Strategic Inquiry", placeholder="Ask a question about the collected knowledge base...", lines=3)
        with gr.Row():
            submit_btn = gr.Button("ANALYZE DATA", variant="primary", scale=2)
            clear_btn = gr.ClearButton([user_input], value="RESET DASHBOARD", scale=1)
        gr.Markdown("### 📋 Intelligence Report")
        with gr.Column(elem_classes="report-box"):
            output_display = gr.Markdown(value=_IDLE_REPORT)

    # Both the button and pressing Enter in the textbox trigger the analysis.
    submit_btn.click(fn=process_query, inputs=user_input, outputs=output_display)
    user_input.submit(fn=process_query, inputs=user_input, outputs=output_display)
    # The clear button is created before the report panel exists, so the
    # panel is restored through an extra click handler; otherwise a reset
    # would leave the previous report on screen.
    clear_btn.click(fn=lambda: _IDLE_REPORT, inputs=None, outputs=output_display)

demo.launch(share=True)