Spaces:
Sleeping
Sleeping
| import os | |
| import faiss | |
| import gradio as gr | |
| from groq import Groq | |
| from pypdf import PdfReader | |
| from sentence_transformers import SentenceTransformer | |
| import numpy as np | |
| # =============================== | |
| # OPTIONAL: COLAB SECRET SUPPORT | |
| # =============================== | |
def get_groq_key():
    """Return the Groq API key.

    Prefers the Colab secret ``GROQ_API_KEY`` when running inside Google
    Colab; otherwise falls back to the environment variable of the same
    name (may return None if neither is set).
    """
    try:
        # Only importable inside Google Colab; raises ImportError elsewhere.
        from google.colab import userdata
        key = userdata.get("GROQ_API_KEY")
        if key:
            print("β Using GROQ key from Colab Secrets")
            return key
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; `Exception` still covers the expected
        # ImportError / secret-access failures and falls through to env.
        pass
    print("β Using GROQ key from Environment Variables")
    return os.environ.get("GROQ_API_KEY")
# ===============================
# CONFIG
# ===============================
# Sentence-embedding model used for both document chunks and queries.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Groq-hosted chat model that generates the final answer.
LLM_MODEL = "llama-3.3-70b-versatile"
# Loaded once at startup; downloads weights on first run.
embed_model = SentenceTransformer(EMBED_MODEL_NAME)
client = Groq(api_key=get_groq_key())
# Module-level RAG state, populated by create_vector_store():
# vector_store is the FAISS index, stored_chunks the texts it indexes.
vector_store = None
stored_chunks = []
| # =============================== | |
| # PDF TEXT EXTRACTION | |
| # =============================== | |
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Pages that yield text are joined with a trailing newline each;
    pages with no extractable text are skipped.
    """
    reader = PdfReader(pdf_file)
    pieces = []
    for page in reader.pages:
        content = page.extract_text()
        if content:
            pieces.append(content + "\n")
    return "".join(pieces)
| # =============================== | |
| # TEXT CHUNKING | |
| # =============================== | |
| def chunk_text(text, chunk_size=500, overlap=100): | |
| chunks = [] | |
| start = 0 | |
| while start < len(text): | |
| end = start + chunk_size | |
| chunks.append(text[start:end]) | |
| start += chunk_size - overlap | |
| return chunks | |
| # =============================== | |
| # CREATE VECTOR STORE | |
| # =============================== | |
def create_vector_store(chunks):
    """Embed *chunks* and (re)build the module-level FAISS index.

    Side effects: replaces the globals `vector_store` (an IndexFlatL2
    over float32 embeddings) and `stored_chunks` (the indexed texts).
    """
    global vector_store, stored_chunks
    vectors = np.asarray(embed_model.encode(chunks), dtype="float32")
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    vector_store = index
    stored_chunks = chunks
| # =============================== | |
| # FLEXIBLE RETRIEVAL (WITH SCORE) | |
| # =============================== | |
def retrieve_context(query, k=3):
    """Return the top-*k* chunks for *query* plus the mean L2 distance.

    Args:
        query: The user question to embed and search.
        k: Number of nearest chunks to retrieve.

    Returns:
        Tuple of (joined chunk text, average L2 distance). The distance
        is 999 when no valid neighbors are found, signalling "no match"
        to the confidence thresholds in ask_question().
    """
    query_embedding = np.asarray(embed_model.encode([query]), dtype="float32")
    distances, indices = vector_store.search(query_embedding, k)
    results = []
    valid_distances = []
    for dist, idx in zip(distances[0], indices[0]):
        # FAISS pads `indices` with -1 when fewer than k vectors exist;
        # the old `idx < len(stored_chunks)` test accepted -1 and
        # silently returned the LAST chunk. Require a valid index.
        if 0 <= idx < len(stored_chunks):
            results.append(stored_chunks[idx])
            valid_distances.append(dist)
    avg_distance = float(np.mean(valid_distances)) if valid_distances else 999
    return "\n".join(results), avg_distance
| # =============================== | |
| # FLEXIBLE LLM CALL | |
| # =============================== | |
def ask_llm(context, question, confidence_level):
    """Ask the Groq chat model to answer *question* from *context*.

    The prompt embeds the retrieval confidence label (HIGH/MEDIUM/LOW)
    and instructs the model to disclose how grounded its answer is.
    Returns the model's reply text.
    """
    prompt = f"""
You are a helpful assistant answering questions about a document.
Context from document:
{context}
User Question:
{question}
Confidence Level from retrieval:
{confidence_level}
Instructions:
1. If confidence is HIGH β Answer as document fact.
2. If confidence is MEDIUM β Answer but clearly say it is inferred from document.
3. If confidence is LOW β Answer generally and clearly say it is NOT directly mentioned in document.
Be honest and transparent.
"""
    # Single-turn call; model name comes from the module-level LLM_MODEL.
    completion = client.chat.completions.create(
        model=LLM_MODEL,
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content
| # =============================== | |
| # PROCESS PDF SAFE | |
| # =============================== | |
def process_pdf(file):
    """Validate, extract, chunk and index an uploaded PDF.

    Args:
        file: Gradio file object (exposes the temp path via `.name`),
            or None when nothing was uploaded.

    Returns:
        A human-readable status string; errors are reported as text
        rather than raised, so the UI never crashes.
    """
    try:
        # Guard clauses: missing upload, then wrong extension.
        if file is None:
            return "β Please upload a PDF first."
        if not file.name.lower().endswith(".pdf"):
            return "β Only PDF files are allowed."
        print("Processing file:", file.name)
        document_text = extract_text_from_pdf(file.name)
        if not document_text.strip():
            return "β No readable text found (maybe scanned PDF)."
        pieces = chunk_text(document_text)
        create_vector_store(pieces)
        return f"β Document processed successfully!\nChunks created: {len(pieces)}"
    except Exception as error:
        # Boundary handler: surface the failure to the UI as text.
        return f"β Error processing document: {str(error)}"
| # =============================== | |
| # FLEXIBLE ASK QUESTION | |
| # =============================== | |
def ask_question(question):
    """Answer *question* against the indexed document.

    Retrieves context from the FAISS store, buckets the retrieval
    distance into a HIGH/MEDIUM/LOW confidence label, and prefixes the
    LLM's answer with that label. Errors come back as text, not raised.
    """
    try:
        # Guard clauses: no index built yet, then blank input.
        if vector_store is None:
            return "β οΈ Please upload and process PDF first."
        if not question.strip():
            return "β οΈ Please enter a question."
        context, distance = retrieve_context(question)
        # Map L2 distance onto a coarse confidence bucket (thresholds tunable).
        confidence = (
            "HIGH" if distance < 0.6
            else "MEDIUM" if distance < 1.2
            else "LOW"
        )
        answer = ask_llm(context, question, confidence)
        return f"π Confidence: {confidence}\n\n{answer}"
    except Exception as error:
        return f"β Error generating answer: {str(error)}"
# ===============================
# RESIZE FIXED BEAUTIFUL UI
# ===============================
# CSS injected into the Gradio page via gr.Blocks(css=...): pins the
# container to a fixed 1100px width (prevents resize jumps), styles the
# two "card" groups, and formats the footer.
custom_css = """
/* Force full width stable layout */
.gradio-container {
max-width: 1100px !important;
min-width: 1100px !important;
margin: auto !important;
}
/* Prevent height jump */
.block-container {
min-height: 700px;
}
/* Card Style */
.card {
background: rgba(255,255,255,0.04);
padding: 22px;
border-radius: 16px;
border: 1px solid rgba(255,255,255,0.08);
margin-bottom: 20px;
}
/* File Upload Fix */
input[type="file"] {
min-height: 60px;
}
/* Footer */
.footer {
text-align:center;
font-size:13px;
color:#888;
margin-top:20px;
}
"""
# Assemble the Gradio UI: an upload/process card, a Q&A card, and the
# event wiring from each button to its handler above.
# NOTE(review): original indentation was lost in extraction; component
# nesting inside the Group blocks is reconstructed from standard
# gr.Blocks usage — confirm against the running app.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π Flexible RAG PDF APP")
    gr.Markdown("Groq LLM β’ FAISS β’ Semantic Search")
    # --- Document upload / indexing card ---
    with gr.Group(elem_classes="card"):
        gr.Markdown("### π Upload Document")
        pdf_input = gr.File(
            file_types=[".pdf"],
            label="Upload PDF"
        )
        process_btn = gr.Button("π Process Document", variant="primary")
        status_output = gr.Textbox(
            label="π Status",
            lines=3,
            max_lines=10
        )
    # --- Question answering card ---
    with gr.Group(elem_classes="card"):
        gr.Markdown("### π€ Ask Questions")
        question_input = gr.Textbox(
            label="Enter Question",
            placeholder="Ask anything about your document..."
        )
        ask_btn = gr.Button("π¬ Ask AI", variant="primary")
        answer_output = gr.Textbox(label="π§ Answer", lines=10)
    gr.HTML("<div class='footer'>Built with β€οΈ using Groq + RAG</div>")
    # Wire the buttons to the backend handlers.
    process_btn.click(process_pdf, inputs=pdf_input, outputs=status_output)
    ask_btn.click(ask_question, inputs=question_input, outputs=answer_output)
# share=True exposes a temporary public URL (Colab/Spaces friendly).
demo.launch(share=True)