# AI PDF Q&A Assistant — Hugging Face Space app.
# (Removed pasted Space status banner "Spaces: Sleeping" — page residue, not code.)
# Patch cached_download import for compatibility with newer huggingface-hub:
# the symbol was removed in recent releases, but downstream libraries
# (e.g. sentence-transformers) may still try to import it.
import sys
import types
try:
    from huggingface_hub import cached_download  # present in older versions
except ImportError:
    import huggingface_hub
    # Install a no-op stand-in so later `from huggingface_hub import
    # cached_download` statements resolve instead of crashing.
    # NOTE(review): returning None is only safe if nothing actually calls
    # this function at runtime — confirm.
    huggingface_hub.cached_download = lambda *args, **kwargs: None
| import os | |
| import io | |
| import requests | |
| import pdfplumber | |
| import numpy as np | |
| import faiss | |
| import gradio as gr | |
| from sklearn.preprocessing import normalize | |
| from sentence_transformers import SentenceTransformer | |
# =========================================================
# β Global Variables
# =========================================================
DOCS = []  # list of {"text": chunk} dicts for the currently processed PDF
FAISS_INDEX = None  # FAISS index over DOCS embeddings; None until a PDF is processed
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")  # supplied via Space secrets
# =========================================================
# β Embedding Model Setup
# =========================================================
# Sentence-embedding model shared by document chunks and queries.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# =========================================================
# β Helper Functions
# =========================================================
def extract_text_from_pdf(file_bytes):
    """Extract plain text from a PDF supplied as raw bytes.

    Pages with no extractable text contribute an empty line; the final
    result has surrounding whitespace stripped.
    """
    page_texts = []
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        for page in pdf.pages:
            page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts).strip()
def chunk_text(text, chunk_size=700):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words."""
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[start:start + chunk_size]))
    return chunks
def embed_texts(texts):
    """Encode *texts* with the global embedder and return L2-normalized
    float32 vectors, one row per input string."""
    vectors = normalize(embedder.encode(texts))
    return np.asarray(vectors).astype("float32")
def build_faiss_index(embeddings):
    """Build an exact (brute-force) L2 FAISS index over the rows of *embeddings*."""
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index
def search_docs(query, k=4):
    """Return up to *k* chunk texts most similar to *query*.

    Falls back to a single warning string when no PDF has been processed.

    Bug fix: FAISS pads its result labels with -1 when the index holds
    fewer than *k* vectors; the original code indexed DOCS[-1] for those
    sentinels and silently returned the last chunk again. We cap *k* at
    the number of documents and filter out any remaining -1 labels.
    """
    global DOCS, FAISS_INDEX
    if not DOCS or FAISS_INDEX is None:
        return ["β οΈ Please upload and process a PDF first."]
    q_emb = embed_texts([query])
    _, labels = FAISS_INDEX.search(q_emb, min(k, len(DOCS)))
    return [DOCS[i]["text"] for i in labels[0] if i >= 0]
# =========================================================
# β GROQ API Chat Function
# =========================================================
def call_groq_chat(system_prompt, user_prompt):
    """Send a two-message chat completion request to the Groq API.

    Returns the assistant's reply text, or a human-readable error string
    on any failure (missing key, HTTP error, network problem).
    """
    if not GROQ_API_KEY:
        return "β οΈ Missing GROQ_API_KEY. Please set it in Hugging Face Space secrets."
    payload = {
        "model": "llama-3.1-8b-instant",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.3,
    }
    try:
        resp = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {GROQ_API_KEY}",
                "Content-Type": "application/json",
            },
            json=payload,
            timeout=30,
        )
        # Map the most common failure codes to friendly messages.
        friendly = {
            401: "β Unauthorized: Invalid or missing Groq API key.",
            404: "β API endpoint or model not found.",
            429: "β οΈ Too many requests. Please try again later.",
        }
        if resp.status_code in friendly:
            return friendly[resp.status_code]
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
    except Exception as e:
        return f"β Error contacting Groq API: {str(e)}"
# =========================================================
# β Process PDF
# =========================================================
def process_pdf(file_obj):
    """Generator: turn an uploaded PDF into a searchable FAISS index.

    Yields human-readable progress/status strings for the Gradio status
    box. On success, populates the module-level DOCS and FAISS_INDEX.
    """
    global DOCS, FAISS_INDEX
    if file_obj is None:
        yield "β οΈ Please upload a PDF first."
        return
    try:
        yield "π₯ Reading PDF..."
        # Gradio may hand us a dict payload, a file-like object, or a path,
        # depending on version/configuration — normalize all three to bytes.
        if isinstance(file_obj, dict) and "data" in file_obj:
            raw = file_obj["data"]
        elif hasattr(file_obj, "read"):
            raw = file_obj.read()
        elif isinstance(file_obj, str) and os.path.exists(file_obj):
            with open(file_obj, "rb") as fh:
                raw = fh.read()
        else:
            raw = None
        if raw is None:
            yield f"β Unsupported file type: {type(file_obj)}"
            return
        yield "βοΈ Extracting text..."
        text = extract_text_from_pdf(raw)
        if not text.strip():
            yield "β οΈ No extractable text found."
            return
        yield "π Splitting text into chunks..."
        chunks = chunk_text(text)
        yield "π§ Creating embeddings..."
        DOCS = [{"text": c} for c in chunks]
        embs = embed_texts([d["text"] for d in DOCS])
        yield "π¦ Building FAISS index..."
        FAISS_INDEX = build_faiss_index(embs)
        yield f"β Successfully processed {len(chunks)} chunks."
    except Exception as e:
        yield f"β Error processing PDF: {str(e)}"
# =========================================================
# β Answer Question
# =========================================================
def answer_question(query, history):
    """Answer *query* from the indexed PDF and append the turn to *history*.

    *history* is the gr.Chatbot list of [user, assistant] pairs; the
    updated list is returned.

    Bug fixes: the "no PDF" warning used to be appended as the *user*
    message with an empty assistant reply — it is now paired with the
    user's question as the bot's answer. Also tolerates history=None,
    which Gradio can pass before the first exchange.
    """
    history = history or []
    if not DOCS or FAISS_INDEX is None:
        return history + [[query, "β οΈ Please upload and process a PDF first."]]
    related = search_docs(query)
    context = "\n\n".join(related)
    system_prompt = "You are a helpful assistant answering based on the provided document."
    user_prompt = f"Document context:\n{context}\n\nUser question: {query}"
    answer = call_groq_chat(system_prompt, user_prompt)
    history.append([query, answer])
    return history
# =========================================================
# β UI Design (Modern Look)
# =========================================================
# Gradio Blocks layout: upload/process row, status box, then the chat area.
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray"),
    css="""
    body {background: linear-gradient(135deg, #e3f2fd, #bbdefb);}
    .gradio-container {max-width: 900px !important; margin: auto;}
    .chatbox {height: 400px; overflow: auto; background: white; border-radius: 12px;
    box-shadow: 0 2px 10px rgba(0,0,0,0.1); padding: 10px;}
    .status-box {background: #f0f8ff; border-radius: 8px; padding: 10px; color: #333;}
    h1 {text-align:center; font-size: 2em; color: #0d47a1;}
    """
) as app:
    gr.Markdown("<h1>π AI PDF Q&A Assistant</h1><p style='text-align:center;'>Powered by Groq + FAISS + Gradio</p>")
    with gr.Row():
        pdf_file = gr.File(label="π Upload PDF", file_types=[".pdf"])
        process_btn = gr.Button("βοΈ Process PDF", variant="primary")
    status_box = gr.Textbox(label="π Status", elem_classes="status-box", interactive=False)
    # process_pdf is a generator, so the status box streams progress messages.
    process_btn.click(process_pdf, inputs=pdf_file, outputs=status_box)
    gr.Markdown("### π¬ Ask Questions About Your PDF")
    chatbot = gr.Chatbot(label="Chat", elem_classes="chatbox", bubble_full_width=False)
    query_box = gr.Textbox(label="Type your question here...")
    clear_btn = gr.Button("π§Ή Clear Chat")
    # Enter in the textbox sends the question; the chatbot holds the history.
    query_box.submit(answer_question, [query_box, chatbot], chatbot)
    # Clearing bypasses the queue for an instant reset of the chat widget.
    clear_btn.click(lambda: None, None, chatbot, queue=False)
# =========================================================
# β Launch
# =========================================================
# Start the Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    app.launch()