# Patch cached_download import for compatibility with newer huggingface-hub
import sys
import types

try:
    from huggingface_hub import cached_download
except ImportError:
    # Newer huggingface-hub removed cached_download; stub it so legacy
    # dependencies that still reference it can import without crashing.
    import huggingface_hub
    huggingface_hub.cached_download = lambda *args, **kwargs: None

import os
import io
import requests
import pdfplumber
import numpy as np
import faiss
import gradio as gr
from sklearn.preprocessing import normalize
from sentence_transformers import SentenceTransformer

# =========================================================
# ✅ Global Variables
# =========================================================
DOCS = []            # list of {"text": chunk} dicts for the currently processed PDF
FAISS_INDEX = None   # FAISS index built over the chunk embeddings
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# =========================================================
# ✅ Embedding Model Setup
# =========================================================
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# =========================================================
# ✅ Helper Functions
# =========================================================
def extract_text_from_pdf(file_bytes):
    """Extract plain text from a PDF given its raw bytes.

    Pages with no extractable text contribute an empty string rather
    than aborting extraction.
    """
    text = ""
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text() or ""
            text += page_text + "\n"
    return text.strip()


def chunk_text(text, chunk_size=700):
    """Split *text* into whitespace-delimited chunks of ~chunk_size words."""
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]


def embed_texts(texts):
    """Encode *texts* into L2-normalized float32 embeddings for FAISS."""
    embeddings = embedder.encode(texts)
    embeddings = normalize(embeddings)
    return np.array(embeddings).astype("float32")


def build_faiss_index(embeddings):
    """Build a flat L2 FAISS index over the given embedding matrix.

    Embeddings are normalized upstream, so L2 distance ranks the same
    as cosine similarity.
    """
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index


def search_docs(query, k=4):
    """Return up to *k* chunk texts most relevant to *query*."""
    global DOCS, FAISS_INDEX
    if not DOCS or FAISS_INDEX is None:
        return ["⚠️ Please upload and process a PDF first."]
    q_emb = embed_texts([query])
    # Never ask FAISS for more neighbours than there are documents.
    k = min(k, len(DOCS))
    D, I = FAISS_INDEX.search(q_emb, k)
    # FAISS pads missing results with -1; a negative index would otherwise
    # silently wrap around and return the LAST chunk via Python indexing.
    return [DOCS[i]["text"] for i in I[0] if i >= 0]

# =========================================================
# ✅ GROQ API Chat Function
# =========================================================
def call_groq_chat(system_prompt, user_prompt):
    """Send a single system+user exchange to the Groq chat API.

    Returns the assistant's reply text, or a human-readable error
    string (never raises) so the UI can display failures inline.
    """
    if not GROQ_API_KEY:
        return "⚠️ Missing GROQ_API_KEY. Please set it in Hugging Face Space secrets."

    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {"Authorization": f"Bearer {GROQ_API_KEY}", "Content-Type": "application/json"}
    body = {
        "model": "llama-3.1-8b-instant",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "temperature": 0.3
    }

    try:
        resp = requests.post(url, headers=headers, json=body, timeout=30)
        # Map the common failure codes to friendly messages before raising.
        if resp.status_code == 401:
            return "❌ Unauthorized: Invalid or missing Groq API key."
        if resp.status_code == 404:
            return "❌ API endpoint or model not found."
        if resp.status_code == 429:
            return "⚠️ Too many requests. Please try again later."
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
    except Exception as e:
        return f"❌ Error contacting Groq API: {str(e)}"

# =========================================================
# ✅ Process PDF
# =========================================================
def process_pdf(file_obj):
    """Generator: ingest an uploaded PDF and build the retrieval index.

    Yields progress messages for the status textbox. Accepts the three
    shapes Gradio may hand us: a {"data": bytes} dict, a file-like
    object, or a filesystem path string.
    """
    global DOCS, FAISS_INDEX
    if file_obj is None:
        yield "⚠️ Please upload a PDF first."
        return

    try:
        yield "📥 Reading PDF..."
        raw = None
        if isinstance(file_obj, dict) and "data" in file_obj:
            raw = file_obj["data"]
        elif hasattr(file_obj, "read"):
            raw = file_obj.read()
        elif isinstance(file_obj, str) and os.path.exists(file_obj):
            with open(file_obj, "rb") as f:
                raw = f.read()
        if raw is None:
            yield f"❌ Unsupported file type: {type(file_obj)}"
            return

        yield "✏️ Extracting text..."
        text = extract_text_from_pdf(raw)
        if not text.strip():
            yield "⚠️ No extractable text found."
            return

        yield "📄 Splitting text into chunks..."
        chunks = chunk_text(text)

        yield "🧠 Creating embeddings..."
        DOCS = [{"text": c} for c in chunks]
        embs = embed_texts([d["text"] for d in DOCS])

        yield "📦 Building FAISS index..."
        FAISS_INDEX = build_faiss_index(embs)
        yield f"✅ Successfully processed {len(chunks)} chunks."
    except Exception as e:
        yield f"❌ Error processing PDF: {str(e)}"

# =========================================================
# ✅ Answer Question
# =========================================================
def answer_question(query, history):
    """Answer *query* from the indexed PDF and return the updated history."""
    history = history or []  # Gradio may pass None before the first turn
    if not DOCS or FAISS_INDEX is None:
        # Keep the question in the user slot; the warning is the assistant reply
        # (the old code put the warning in the user slot and dropped the query).
        return history + [[query, "⚠️ Please upload and process a PDF first."]]

    related = search_docs(query)
    context = "\n\n".join(related)

    system_prompt = "You are a helpful assistant answering based on the provided document."
    user_prompt = f"Document context:\n{context}\n\nUser question: {query}"

    answer = call_groq_chat(system_prompt, user_prompt)
    history.append([query, answer])
    return history

# =========================================================
# ✅ UI Design (Modern Look)
# =========================================================
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray"),
    css="""
    body {background: linear-gradient(135deg, #e3f2fd, #bbdefb);}
    .gradio-container {max-width: 900px !important; margin: auto;}
    .chatbox {height: 400px; overflow: auto; background: white; border-radius: 12px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); padding: 10px;}
    .status-box {background: #f0f8ff; border-radius: 8px; padding: 10px; color: #333;}
    h1 {text-align:center; font-size: 2em; color: #0d47a1;}
    """
) as app:
    # NOTE(review): the original heading markup was lost in a bad paste; the
    # CSS above styles an h1, so a title heading is restored here — confirm
    # the exact original wording.
    gr.Markdown(
        "# 📘 PDF Q&A Assistant\n"
        "Powered by Groq + FAISS + Gradio"
    )

    with gr.Row():
        pdf_file = gr.File(label="📂 Upload PDF", file_types=[".pdf"])
        process_btn = gr.Button("⚙️ Process PDF", variant="primary")

    status_box = gr.Textbox(label="📊 Status", elem_classes="status-box", interactive=False)
    process_btn.click(process_pdf, inputs=pdf_file, outputs=status_box)

    gr.Markdown("### 💬 Ask Questions About Your PDF")
    chatbot = gr.Chatbot(label="Chat", elem_classes="chatbox", bubble_full_width=False)
    query_box = gr.Textbox(label="Type your question here...")
    clear_btn = gr.Button("🧹 Clear Chat")

    query_box.submit(answer_question, [query_box, chatbot], chatbot)
    clear_btn.click(lambda: None, None, chatbot, queue=False)

# =========================================================
# ✅ Launch
# =========================================================
if __name__ == "__main__":
    app.launch()