# NOTE: this file was captured from a web file viewer. The viewer's page chrome
# ("Spaces: Running", "File size: 5,405 Bytes", per-line commit hashes and the
# line-number gutter 1-139) is not part of the program and has been reduced to
# this comment.
import os
import gdown
import time
import gradio as gr
# Modern Imports
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
# ==========================================
# 1. SETUP & KEYS
# ==========================================
# ChatGroq is constructed further down without an explicit api_key, so the key
# must come from the GROQ_API_KEY environment variable (see the langchain-groq
# documentation). Fail fast with a readable message: the previous self-assignment
# `os.environ[k] = os.getenv(k)` did nothing when the variable was set and raised
# an opaque TypeError when it was missing.
if not os.getenv("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set. Define it as an environment variable "
        "(or deployment secret) before starting the app."
    )
# --- UPDATE THIS LIST WITH ALL YOUR LINKS ---
links_to_process = [
    "https://drive.google.com/file/d/1rb7AeJZrDNR-bq8Q9V4IvtzYZsDOvDH0/view?usp=sharing",
    "https://drive.google.com/file/d/16PcJo_JaQHh1bx01lCAkc4QwQ6YnLb-K/view?usp=sharing",
    # The trailing comma above keeps the list valid when this entry is enabled.
    # "https://drive.google.com/drive/folders/ANOTHER_FOLDER_ID",
]

# Local directory that downloads are written to and that is scanned for PDFs.
output_dir = 'knowledge_base'
# exist_ok=True replaces the exists()-then-makedirs() pair: no check/create race,
# and it is a no-op when the directory is already there.
os.makedirs(output_dir, exist_ok=True)
# ==========================================
# 2. IMPROVED DOWNLOAD LOGIC
# ==========================================
def build_vector_db(links):
    """Download the given Google Drive sources and index their PDFs with FAISS.

    Every link is fetched into the module-level ``output_dir``. All PDF files
    found under that directory (subfolders included) are then loaded, split
    into overlapping chunks, embedded and stored in a FAISS vector store.

    Args:
        links: Sized collection of Google Drive URLs. URLs containing
            ``/folders/`` are downloaded as folders; any other URL is
            treated as a single file.

    Returns:
        The FAISS vector store built from all loaded PDF chunks.

    Raises:
        ValueError: If no PDF content could be loaded from ``output_dir``.
    """
    print(f"π₯ Starting synchronization for {len(links)} sources...")
    for link in links:
        # Best effort: one unreachable link is reported and skipped so the
        # remaining sources are still synchronised.
        try:
            if "/folders/" in link:
                print(f"π Syncing Folder: {link}")
                gdown.download_folder(url=link, output=output_dir, quiet=True, use_cookies=False)
            else:
                print(f"π Syncing Individual File: {link}")
                # os.path.join(dir, "") appends the platform separator, which
                # is what tells gdown to save *into* the directory.
                # fuzzy=True lets gdown extract the file id from
                # ".../file/d/<id>/view" share links; without it those links
                # are not resolved to the actual file download.
                gdown.download(
                    url=link,
                    output=os.path.join(output_dir, ""),
                    quiet=True,
                    fuzzy=True,
                )
            time.sleep(1)  # Small pause to respect Drive rate limits
        except Exception as e:
            print(f"β οΈ Skip Link: Could not download {link}. Error: {e}")

    all_docs = []
    # os.walk also finds PDFs inside subfolders created by download_folder.
    for root, _dirs, files in os.walk(output_dir):
        for filename in files:
            # Case-insensitive match so "REPORT.PDF" is indexed as well.
            if not filename.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(root, filename)
            try:
                all_docs.extend(PyPDFLoader(file_path).load())
            except Exception as e:
                # A single unreadable PDF must not abort the whole build.
                print(f"β Error loading {filename}: {e}")

    if not all_docs:
        raise ValueError("No PDF documents found! Ensure links are set to 'Anyone with the link'.")

    # Chunking & Embeddings
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    chunks = text_splitter.split_documents(all_docs)
    print(f"π§ Creating embeddings for {len(chunks)} text chunks...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_db = FAISS.from_documents(chunks, embeddings)
    print("✅ Multi-Source Vector Database Created Successfully!")
    return vector_db
# Initialize
vector_store = build_vector_db(links_to_process)
# k=3: each question is answered from the three best-matching chunks.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})
# ==========================================
# 3. MODERN RAG CHAIN
# ==========================================
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
Helpful Answer:"""
prompt = ChatPromptTemplate.from_template(template)


def _format_docs(docs):
    """Join the text of the retrieved documents into one prompt-ready string."""
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever returns Document objects. Without the formatter their Python
# repr (metadata, escaped newlines) would be pasted into {context}; piping
# through _format_docs hands the model only the chunk text.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
# ==========================================
# 4. PROFESSIONAL FRONTEND (GRADIO BLOCKS)
# ==========================================
# Stylesheet passed to gr.Blocks: centres and caps the width of the main
# column, centres the header text, and frames the report area as a white card.
custom_css = (
    "\n"
    "#main-container { max-width: 900px; margin: auto; padding: 20px; }\n"
    ".header-text { text-align: center; color: #1e293b; margin-bottom: 2px; }\n"
    ".report-box { background-color: #ffffff; border-radius: 8px; border: 1px solid #e2e8f0; padding: 15px; min-height: 200px; }\n"
)
def process_query(query):
    """Answer a user question through the RAG chain for display in the UI.

    Args:
        query: Text from the inquiry textbox; may be ``None`` or blank.

    Returns:
        Markdown text: the chain's answer, a note asking for input when the
        query is empty, or an error note when the chain raised.
    """
    # The textbox value may arrive as None (e.g. after the clear button reset
    # it), so test for that before calling .strip().
    if not query or not query.strip():
        return "### β οΈ System Note\n*Please enter a strategic inquiry to begin analysis.*"
    try:
        return rag_chain.invoke(query)
    except Exception as e:
        # Show the failure in the report pane instead of breaking the callback.
        return f"### β Error\nAn error occurred: {str(e)}"
# Page layout: header, inquiry textbox, action buttons and the report pane,
# all inside one width-limited column styled by custom_css.
soft_theme = gr.themes.Soft(primary_hue="indigo")
with gr.Blocks(theme=soft_theme, css=custom_css) as demo:
    with gr.Column(elem_id="main-container"):
        # Header
        gr.Markdown("# ποΈ Enterprise Knowledge Engine", elem_classes="header-text")
        gr.Markdown("<p style='text-align: center;'>Multi-Source Document Synthesis via Groq & FAISS</p>")
        gr.HTML("<hr>")

        # Question input
        user_input = gr.Textbox(
            label="Strategic Inquiry",
            placeholder="Ask a question about the collected knowledge base...",
            lines=3,
        )

        # Actions: run the analysis, or clear the inquiry textbox.
        with gr.Row():
            submit_btn = gr.Button("ANALYZE DATA", variant="primary", scale=2)
            clear_btn = gr.ClearButton([user_input], value="RESET DASHBOARD", scale=1)

        # Answer area
        gr.Markdown("### π Intelligence Report")
        with gr.Column(elem_classes="report-box"):
            output_display = gr.Markdown(value="_Awaiting input..._")

    # The button click and the textbox submit event run the same handler.
    for register_event in (submit_btn.click, user_input.submit):
        register_event(fn=process_query, inputs=user_input, outputs=output_display)

demo.launch(share=True)