File size: 5,405 Bytes
0eb6862
40c6698
 
e68fa5e
 
40c6698
e68fa5e
 
 
 
 
 
 
 
 
 
40c6698
e68fa5e
b7a10b9
40c6698
 
 
 
 
 
 
 
 
 
 
e68fa5e
 
40c6698
e68fa5e
40c6698
 
e68fa5e
40c6698
 
 
 
 
 
 
 
 
e68fa5e
40c6698
 
 
e68fa5e
40c6698
 
 
 
 
 
 
 
 
 
 
e68fa5e
 
40c6698
e68fa5e
40c6698
 
 
e68fa5e
40c6698
 
e68fa5e
 
40c6698
e68fa5e
 
40c6698
 
 
e68fa5e
31148e7
40c6698
31148e7
40c6698
e68fa5e
40c6698
31148e7
e68fa5e
40c6698
 
 
e68fa5e
 
 
 
 
 
 
 
 
 
 
40c6698
e68fa5e
40c6698
 
 
 
 
79941ec
e68fa5e
 
40c6698
e26de1e
40c6698
e26de1e
40c6698
f3ed6f8
40c6698
 
 
 
 
f3ed6f8
40c6698
ad8feec
40c6698
 
 
f3ed6f8
40c6698
 
 
ad8feec
40c6698
 
ad8feec
40c6698
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import os
import gdown
import time
import gradio as gr

# Modern Imports
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# ==========================================
# 1. SETUP & KEYS
# ==========================================
# Fail fast with a readable message when the key is missing. Writing
# os.getenv()'s None result back into os.environ would instead raise an
# opaque "TypeError: str expected, not NoneType"; when the key IS set, the
# environment already holds it, so nothing needs to be copied.
# NOTE(review): the Groq client is presumably reading this variable itself,
# since no key is passed explicitly anywhere in this file.
if not os.getenv("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it (or add it as a secret) "
        "before starting the app."
    )

# --- UPDATE THIS LIST WITH ALL YOUR LINKS ---
# Links containing "/folders/" are synced as folders; anything else is
# downloaded as a single file.
links_to_process = [
    "https://drive.google.com/file/d/1rb7AeJZrDNR-bq8Q9V4IvtzYZsDOvDH0/view?usp=sharing",
    "https://drive.google.com/file/d/16PcJo_JaQHh1bx01lCAkc4QwQ6YnLb-K/view?usp=sharing",
    # "https://drive.google.com/drive/folders/ANOTHER_FOLDER_ID",
]

# Local directory that receives every downloaded document.
output_dir = 'knowledge_base'
# exist_ok avoids the check-then-create race of a separate os.path.exists().
os.makedirs(output_dir, exist_ok=True)

# ==========================================
# 2. IMPROVED DOWNLOAD LOGIC
# ==========================================
def build_vector_db(links):
    """Download every source in *links* and index the PDFs found on disk.

    Each link is fetched into the module-level ``output_dir``; links that
    contain ``/folders/`` are synced as folders, all others as single files.
    Every ``.pdf`` under ``output_dir`` (including subfolders) is then loaded,
    split into overlapping chunks, embedded, and stored in a FAISS index.

    Args:
        links: Iterable of Google Drive share URLs.

    Returns:
        The FAISS vector store built from all loaded PDF chunks.

    Raises:
        ValueError: If no PDF could be loaded from ``output_dir``.
    """
    print(f"📥 Starting synchronization for {len(links)} sources...")

    # A trailing path separator asks gdown to save into the directory under
    # the remote file name; os.path.join keeps that portable across OSes.
    download_target = os.path.join(output_dir, "")

    for link in links:
        try:
            if "/folders/" in link:
                print(f"📂 Syncing Folder: {link}")
                result = gdown.download_folder(url=link, output=output_dir, quiet=True, use_cookies=False)
            else:
                print(f"📄 Syncing Individual File: {link}")
                # fuzzy=True lets gdown extract the file id from share links
                # of the form ".../file/d/<id>/view?usp=sharing".
                result = gdown.download(url=link, output=download_target, quiet=True, fuzzy=True)

            # Some gdown versions signal failure by returning None instead of raising.
            if result is None:
                print(f"⚠️ Skip Link: Could not download {link}. Error: gdown returned no file")

            time.sleep(1)  # Small pause to respect Drive rate limits
        except Exception as e:
            # Best effort: one bad link must not abort the remaining sources.
            print(f"⚠️ Skip Link: Could not download {link}. Error: {e}")

    all_docs = []
    # Use os.walk to find PDFs even inside subfolders downloaded by download_folder
    for root, _dirs, files in os.walk(output_dir):
        for filename in files:
            # Case-insensitive match so "REPORT.PDF" is indexed too.
            if not filename.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(root, filename)
            try:
                all_docs.extend(PyPDFLoader(file_path).load())
            except Exception as e:
                # Skip unreadable files but keep indexing the rest.
                print(f"❌ Error loading {filename}: {e}")

    if not all_docs:
        raise ValueError("No PDF documents found! Ensure links are set to 'Anyone with the link'.")

    # Chunking & Embeddings
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    chunks = text_splitter.split_documents(all_docs)

    print(f"🧠 Creating embeddings for {len(chunks)} text chunks...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    vector_db = FAISS.from_documents(chunks, embeddings)
    print("✅ Multi-Source Vector Database Created Successfully!")
    return vector_db

# Initialize: build the index once at startup and expose a retriever that
# returns the 3 best-matching chunks per query.
vector_store = build_vector_db(links_to_process)
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# ==========================================
# 3. MODERN RAG CHAIN
# ==========================================
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)

template = """Answer the question based ONLY on the following context:
{context}

Question: {question}

Helpful Answer:"""

prompt = ChatPromptTemplate.from_template(template)


def _format_docs(docs):
    """Join the text of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever yields Document objects; without _format_docs the prompt's
# {context} slot would receive the list's repr (metadata, escaped newlines)
# rather than the chunk text itself.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# ==========================================
# 4. PROFESSIONAL FRONTEND (GRADIO BLOCKS)
# ==========================================
# Stylesheet for the Blocks layout: one rule each for the "main-container"
# element id and the "header-text" / "report-box" element classes.
custom_css = (
    "\n"
    "#main-container { max-width: 900px; margin: auto; padding: 20px; }\n"
    ".header-text { text-align: center; color: #1e293b; margin-bottom: 2px; }\n"
    ".report-box { background-color: #ffffff; border-radius: 8px; border: 1px solid #e2e8f0; padding: 15px; min-height: 200px; }\n"
)

def process_query(query):
    """Answer *query* with the RAG chain and return Markdown for the UI.

    Args:
        query: Text from the inquiry box. ``None`` is treated like empty input.

    Returns:
        The chain's answer; a note asking for input when *query* is missing
        or only whitespace; or a Markdown error message if the chain raises.
    """
    # Check for None before calling .strip(): None has no such method.
    if not query or not query.strip():
        return "### ⚠️ System Note\n*Please enter a strategic inquiry to begin analysis.*"
    try:
        return rag_chain.invoke(query)
    except Exception as e:
        # UI boundary: show the failure in the report panel instead of
        # letting the callback crash.
        return f"### ❌ Error\nAn error occurred: {e}"

# Page layout: header, inquiry textbox, action buttons, and a report panel.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo"), css=custom_css) as demo:
    with gr.Column(elem_id="main-container"):
        gr.Markdown("# 🏛️ Enterprise Knowledge Engine", elem_classes="header-text")
        gr.Markdown("<p style='text-align: center;'>Multi-Source Document Synthesis via Groq & FAISS</p>")
        gr.HTML("<hr>")

        user_input = gr.Textbox(label="Strategic Inquiry", placeholder="Ask a question about the collected knowledge base...", lines=3)

        with gr.Row():
            submit_btn = gr.Button("ANALYZE DATA", variant="primary", scale=2)
            clear_btn = gr.ClearButton([user_input], value="RESET DASHBOARD", scale=1)

        gr.Markdown("### 📋 Intelligence Report")
        with gr.Column(elem_classes="report-box"):
            output_display = gr.Markdown(value="_Awaiting input..._")

    # Both the button and pressing Enter in the textbox run the same query handler.
    submit_btn.click(fn=process_query, inputs=user_input, outputs=output_display)
    user_input.submit(fn=process_query, inputs=user_input, outputs=output_display)
    # The ClearButton is only wired to the textbox; also put the report panel
    # back to its placeholder so a reset does not leave a stale answer on screen.
    clear_btn.click(fn=lambda: "_Awaiting input..._", inputs=None, outputs=output_display)

demo.launch(share=True)