# NOTE: Hugging Face Spaces page residue ("Spaces: Sleeping") removed — it was
# status text from the hosting page, not part of the application code.
| import gradio as gr | |
| import os | |
| from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer | |
| from sentence_transformers import SentenceTransformer | |
| import faiss | |
| import numpy as np | |
| import PyPDF2 | |
| import docx | |
# -----------------------------
# 1. Load models
# -----------------------------
# Embedding model (small, fast sentence-embedding model; runs on CPU)
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Generation model (seq2seq; free to run locally via transformers)
gen_model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(gen_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_name)
generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
# -----------------------------
# 2. Global storage
# -----------------------------
# documents: list[str] of text chunks from the uploaded notes.
# index: FAISS inner-product index over the chunk embeddings (None until upload).
documents = []
index = None
# -----------------------------
# 3. Read notes file
# -----------------------------
def load_notes(file):
    """Read an uploaded PDF/DOCX/TXT file, chunk it, and build the FAISS index.

    Side effects: replaces the module-level ``documents`` list and ``index``.

    Parameters
    ----------
    file : gradio upload object exposing a ``.name`` filesystem path.

    Returns
    -------
    str
        Human-readable status message shown in the UI.
    """
    global documents, index
    path = file.name
    suffix = path.lower()  # case-insensitive extension check (accept .PDF etc.)
    text = ""
    if suffix.endswith(".pdf"):
        reader = PyPDF2.PdfReader(path)
        for page in reader.pages:
            # extract_text() can return None for image-only pages; call it once
            # per page (it is an expensive operation).
            page_text = page.extract_text()
            if page_text:
                text += page_text + " "
    elif suffix.endswith(".docx"):
        doc = docx.Document(path)
        for para in doc.paragraphs:
            text += para.text + " "
    elif suffix.endswith(".txt"):
        # Open via the path, consistent with the PDF/DOCX branches and robust
        # across gradio versions (the upload object is not always readable).
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            text += f.read()
    else:
        return "β Unsupported file type. Please upload PDF, DOCX, or TXT."
    if not text.strip():
        # Nothing extractable (e.g. scanned/image-only PDF) — don't build an
        # empty index that would break retrieval later.
        return "β No readable text found in the file."
    # Split into overlapping chunks: 500 chars each, 300-char stride
    # (200-char overlap) so sentences straddling a boundary are retrievable.
    step = 300
    size = 500
    chunks = [text[i:i + size] for i in range(0, len(text), step)]
    documents = chunks
    # Embed; normalized embeddings make inner product equal cosine similarity,
    # matching the IndexFlatIP index below.
    embeddings = embedder.encode(chunks, convert_to_numpy=True, normalize_embeddings=True)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)
    return f"β Notes uploaded successfully! {len(chunks)} chunks created."
# -----------------------------
# 4. Question answering
# -----------------------------
def answer_question(question):
    """Retrieve the most relevant note chunks and generate an answer.

    Parameters
    ----------
    question : str
        The user's question from the UI textbox.

    Returns
    -------
    str
        A status message if no notes are loaded / the question is empty,
        otherwise the model-generated answer.
    """
    global documents, index
    if not documents or index is None:
        return "β Please upload notes first."
    if not question or not question.strip():
        return "β Please enter a question."
    # Retrieve top chunks. Never ask FAISS for more neighbors than stored
    # vectors: it pads the result with id -1, and documents[-1] would silently
    # pull in the wrong (last) chunk.
    k = min(3, len(documents))
    q_emb = embedder.encode([question], convert_to_numpy=True, normalize_embeddings=True)
    D, I = index.search(q_emb, k=k)
    retrieved = " ".join(documents[i] for i in I[0] if 0 <= i < len(documents))
    # Generate an answer grounded only in the retrieved notes.
    prompt = f"""You are an expert assistant.
Use only the following notes to answer clearly and professionally.
Notes:
{retrieved}
Question:
{question}
Answer:"""
    result = generator(prompt, max_length=500, do_sample=False)[0]["generated_text"]
    return result.strip()
# -----------------------------
# 5. Gradio UI
# -----------------------------
# Layout: an upload row (file picker + button), a status box, then a Q&A row.
# Component creation order inside the Blocks context defines the on-screen layout.
with gr.Blocks() as demo:
    gr.Markdown("## π RAG Chatbot (Ask Questions About Your Notes)")
    with gr.Row():
        file_input = gr.File(label="Upload Notes (PDF, DOCX, TXT)")
        upload_btn = gr.Button("Upload Notes")
    status = gr.Textbox(label="Status")
    with gr.Row():
        question = gr.Textbox(label="Ask a Question")
        answer = gr.Textbox(label="Answer")
    # Event wiring: the button indexes the uploaded notes; pressing Enter in
    # the question box runs retrieval + generation.
    upload_btn.click(load_notes, inputs=file_input, outputs=status)
    question.submit(answer_question, inputs=question, outputs=answer)
demo.launch()