File size: 7,232 Bytes
c789ee4
 
 
 
 
 
 
 
 
 
 
 
933c098
e5ecb65
c789ee4
 
 
 
 
 
 
 
 
 
 
 
 
933c098
 
c789ee4
 
 
933c098
 
 
 
 
 
 
c789ee4
 
 
 
 
 
 
 
 
 
 
 
 
933c098
c789ee4
 
 
 
 
 
 
 
 
933c098
c789ee4
 
933c098
c789ee4
 
933c098
 
c789ee4
933c098
 
c789ee4
 
 
 
 
 
933c098
c789ee4
 
933c098
c789ee4
 
933c098
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c789ee4
 
 
 
 
 
 
 
 
 
 
933c098
 
 
 
c789ee4
 
 
 
 
933c098
 
 
 
c789ee4
933c098
c789ee4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
933c098
 
 
c789ee4
 
 
 
933c098
c789ee4
933c098
c789ee4
 
 
 
 
 
 
933c098
c789ee4
933c098
 
958c5ce
933c098
 
 
 
 
 
 
c789ee4
 
933c098
c789ee4
 
933c098
c789ee4
 
 
195ceeb
c789ee4
 
933c098
 
c789ee4
195ceeb
 
c789ee4
 
 
933c098
c789ee4
933c098
c789ee4
933c098
 
 
 
 
 
c789ee4
 
 
 
933c098
 
 
 
 
 
 
 
c789ee4
933c098
c789ee4
 
933c098
195ceeb
 
933c098
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
import os
import uuid
import chromadb
import gradio as gr
from pypdf import PdfReader
import docx
from sentence_transformers import SentenceTransformer
from groq import Groq

# =========================
# 🔑 GROQ API (HF SECRET)
# =========================
# Set your secret as "GROQ_API_KEY" in HF Space Settings → Variables and secrets.
# The older secret name "Multi_doc" is still read as a fallback so that
# deployments configured with it keep working.
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY") or os.getenv("Multi_doc"))

# =========================
# 📄 LOAD DOCUMENTS
# =========================
def load_pdf(path):
    """Return the text of every page of the PDF at *path*, joined by newlines.

    A page whose ``extract_text()`` result is falsy (``None`` or an empty
    string) contributes an empty string, so it still occupies one line.
    """
    page_texts = []
    for page in PdfReader(path).pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)

def load_docx(path):
    """Return the paragraph texts of the .docx file at *path*, one per line.

    Only ``Document.paragraphs`` is read; each paragraph's ``text`` becomes
    one newline-separated line of the result.
    """
    paragraphs = docx.Document(path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def load_txt(path):
    """Read the file at *path* as UTF-8 text and return its full contents."""
    with open(path, encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def load_document(path):
    """Load a document's text, picking the loader from the file extension.

    Args:
        path: Path to a ``.pdf``, ``.docx`` or ``.txt`` file. The extension is
            matched case-insensitively.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the extension is missing or not one of the supported
            types.
    """
    # splitext only inspects the final path component, so a dot in a directory
    # name (".../v1.2/README") is not mistaken for a file extension, and a
    # file without any extension yields an empty string rather than its name.
    ext = os.path.splitext(path)[1].lower().lstrip(".")
    if ext == "pdf":
        return load_pdf(path)
    if ext == "docx":
        return load_docx(path)
    if ext == "txt":
        return load_txt(path)
    if not ext:
        raise ValueError("Unsupported file type: (no extension)")
    raise ValueError(f"Unsupported file type: .{ext}")

# =========================
# ✂️ CHUNKING
# =========================
def chunk_text(text, size=400, overlap=80):
    """Split *text* into overlapping windows of whitespace-separated words.

    Args:
        text: The text to split. Words are obtained with ``str.split()``, so
            any run of whitespace is a separator.
        size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < size``.

    Returns:
        A list of ``{"id": int, "text": str}`` dicts with ids counting up from
        0. The list is empty when *text* contains no words.

    Raises:
        ValueError: If *size* or *overlap* is out of range. Without this check
            a step of zero or less would make the window never advance.
    """
    if size <= 0:
        raise ValueError("size must be a positive number of words")
    if not 0 <= overlap < size:
        raise ValueError("overlap must be non-negative and smaller than size")

    words = text.split()
    step = size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append({
            "id": len(chunks),
            "text": " ".join(words[start:start + size]),
        })
        # Once a window reaches the last word, any further window would only
        # repeat words already contained in this one, so stop here.
        if start + size >= len(words):
            break
    return chunks

# =========================
# 🧠 EMBEDDINGS (LOCAL)
# =========================
# Loaded once at import time; embed() reuses this single model instance for
# every call instead of constructing a new one.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed(texts):
    """Encode *texts* with the module-level model and return plain lists.

    The encoder output is converted with ``.tolist()`` so callers receive
    ordinary Python lists rather than the encoder's array object.
    """
    vectors = embed_model.encode(texts, show_progress_bar=False)
    return vectors.tolist()

# =========================
# 🗄️ CHROMA DB
# HF Spaces has a read-only root — use /tmp for writable storage
# =========================
# The client keeps its data under /tmp/chroma_db.
chroma_client = chromadb.PersistentClient(path="/tmp/chroma_db")
# Single shared collection named "rag": process_files() adds chunks to it and
# retrieve() counts and queries it.
collection = chroma_client.get_or_create_collection("rag")

# =========================
# 📁 PROCESS FILES
# =========================
def process_files(files):
    """Load, chunk, embed and store the uploaded files in the collection.

    Args:
        files: The uploads handed over by the Gradio file component. Each item
            is either a path string or an object exposing the path as
            ``.name``. ``None`` or an empty list means nothing was uploaded.

    Returns:
        A human-readable status string: a success line stating how many files
        were actually indexed and how many chunks were stored, followed by one
        line per file that was empty or could not be read. If nothing could be
        indexed, only the warnings/errors are returned.
    """
    if not files:
        return "⚠️ No files uploaded."

    all_chunks = []
    errors = []
    indexed_files = 0  # files that really contributed chunks

    for f in files:
        # Gradio on HF passes file path as a string or NamedString
        file_path = f if isinstance(f, str) else getattr(f, "name", None)
        if not file_path:
            continue
        name = os.path.basename(file_path)

        # Report a per-file failure instead of aborting the whole batch.
        try:
            text = load_document(file_path)
        except Exception as e:
            errors.append(f"❌ Error reading {name}: {e}")
            continue

        if not text.strip():
            errors.append(f"⚠️ {name} appears empty.")
            continue

        all_chunks.extend(
            {"source": name, "text": c["text"]} for c in chunk_text(text)
        )
        indexed_files += 1

    if not all_chunks:
        return "\n".join(errors) if errors else "⚠️ No content could be extracted."

    texts = [c["text"] for c in all_chunks]
    embeddings = embed(texts)

    # Every chunk gets a fresh random id, so processing the same file again
    # adds new entries rather than replacing the earlier ones.
    collection.add(
        ids=[str(uuid.uuid4()) for _ in all_chunks],
        embeddings=embeddings,
        documents=texts,
        metadatas=[{"source": c["source"]} for c in all_chunks],
    )

    result = f"✅ Indexed {indexed_files} file(s) — {len(all_chunks)} chunks stored."
    if errors:
        result += "\n" + "\n".join(errors)
    return result

# =========================
# 🔍 RETRIEVAL
# =========================
def retrieve(query, k=3):
    """Return up to *k* stored chunks for *query*, as ranked by the collection.

    Args:
        query: The question text; it is embedded with ``embed`` before the
            collection is queried.
        k: Maximum number of chunks to return. Values below 1 yield no results.

    Returns:
        A list of ``{"text": str, "source": str}`` dicts in the order the
        collection returned them. The list is empty when the collection holds
        no chunks. A chunk stored without a ``source`` metadata entry is
        reported with the source ``"unknown"``.
    """
    # Guard: collection might be empty, and a non-positive k asks for nothing.
    count = collection.count()
    if count == 0 or k < 1:
        return []

    q_emb = embed([query])[0]
    results = collection.query(
        query_embeddings=[q_emb],
        n_results=min(k, count),  # Can't retrieve more than what's stored
    )

    # The results hold one list per query embedding; exactly one was sent.
    texts = results["documents"][0]
    metadatas = results["metadatas"][0]
    return [
        {"text": text, "source": (meta or {}).get("source", "unknown")}
        for text, meta in zip(texts, metadatas)
    ]

# =========================
# 🤖 GROQ GENERATION
# =========================
def generate(query):
    """Answer *query* from the indexed documents via the Groq chat API.

    The chunks returned by ``retrieve`` are placed in a prompt that instructs
    the model to answer only from that context. A "Sources" section with a
    preview of each chunk is appended to the answer.

    Args:
        query: The user's question.

    Returns:
        A Markdown string: the model's answer followed by the sources, a
        warning when no documents are indexed, or an error message when the
        Groq call fails (the exception is reported, not raised).
    """
    docs = retrieve(query)

    if not docs:
        return "⚠️ No documents indexed yet. Please upload and process files first."

    context = "\n\n".join(
        [f"[{d['source']}]\n{d['text']}" for d in docs]
    )

    prompt = f"""You are a strict RAG assistant.
Answer ONLY from the context below.
If the answer is not found in the context, say: "Not found in documents."

CONTEXT:
{context}

QUESTION:
{query}

ANSWER:"""

    # This is the boundary to the UI: surface any API failure as a message in
    # the chat instead of letting the exception escape.
    try:
        response = groq_client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
            max_tokens=1024,
        )
        answer = response.choices[0].message.content or ""
    except Exception as e:
        return f"❌ Groq API error: {e}"

    previews = []
    for d in docs:
        text = d["text"]
        # Only mark the preview as truncated when something was cut off.
        snippet = text[:200] + ("…" if len(text) > 200 else "")
        previews.append(f"📄 **{d['source']}**\n{snippet}")
    sources = "\n\n".join(previews)

    return f"{answer}\n\n---\n📚 **Sources:**\n{sources}"

# =========================
# 💬 CHAT FUNCTION
# Gradio 5 uses {"role": ..., "content": ...} dicts, not tuples
# =========================
def chat(message, history):
    """Handle one chat turn and return the new textbox value and history.

    Args:
        message: The text the user submitted; ``None`` or whitespace-only text
            is ignored.
        history: The conversation so far as a list of
            ``{"role": ..., "content": ...}`` dicts, or ``None`` when there is
            no conversation yet.

    Returns:
        A ``(textbox_value, history)`` tuple. The textbox value is always the
        empty string so the input is cleared; the history is a new list with
        the user message and the assistant reply appended when a non-blank
        message was sent.
    """
    # Work on a copy so the caller's list is never modified in place, and
    # treat a missing history as an empty conversation.
    history = list(history or [])
    if not message or not message.strip():
        return "", history

    reply = generate(message)
    history.extend([
        {"role": "user", "content": message},
        {"role": "assistant", "content": reply},
    ])
    return "", history

# =========================
# 🎨 GRADIO UI
# =========================
# Two-column layout: document upload and indexing on the left, the chat with
# the indexed documents on the right.
with gr.Blocks(title="Groq RAG Assistant") as app:

    gr.Markdown(
        """# 🧠 Groq RAG Assistant
        Upload your documents, then ask questions about them.
        Powered by **Groq LLaMA3** + **ChromaDB** + **sentence-transformers**.
        """
    )

    with gr.Row():

        with gr.Column(scale=1):
            gr.Markdown("### 📂 Upload Documents")
            files = gr.File(
                file_count="multiple",
                file_types=[".pdf", ".docx", ".txt"],
                label="Upload PDF / DOCX / TXT"
            )
            process_btn = gr.Button("🚀 Process Files", variant="primary")
            status = gr.Textbox(label="Status", interactive=False)

            # Index the uploaded files and show the resulting status text.
            process_btn.click(fn=process_files, inputs=files, outputs=status)

        with gr.Column(scale=2):
            gr.Markdown("### 💬 Ask Your Documents")
            # Gradio 5: type="messages" uses the new dict format
            chatbot = gr.Chatbot(height=480, type="messages")
            msg = gr.Textbox(
                placeholder="Ask a question about your documents…",
                label="Your question",
                lines=2
            )
            with gr.Row():
                submit_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear Chat")

            # The Send button and pressing Enter in the textbox both run the
            # same handler; it returns the cleared textbox and the new history.
            submit_btn.click(fn=chat, inputs=[msg, chatbot], outputs=[msg, chatbot])
            msg.submit(fn=chat, inputs=[msg, chatbot], outputs=[msg, chatbot])
            # Reset both the conversation and the input box.
            clear_btn.click(fn=lambda: ([], ""), outputs=[chatbot, msg])

# =========================
# 🚀 LAUNCH
# =========================
# Start the Gradio server only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    app.launch()