| import gradio as gr |
| import faiss |
| import numpy as np |
| import fitz |
| import docx |
| from pptx import Presentation |
|
|
| from sentence_transformers import SentenceTransformer |
|
|
# Sentence-embedding model shared by indexing (document chunks) and
# querying (user questions), so both live in the same vector space.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
|
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
|
# Seq2seq checkpoint that turns the retrieved context plus the question
# into the final answer; tokenizer and model are loaded from the same name.
gen_model_name = "google/flan-t5-small"
gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_name)

# In-memory knowledge base shared by build_kb() and ask_question():
# the raw text chunks and the FAISS index built over their embeddings.
# ``index`` stays None until a document has been processed.
chunks_store = []
index = None
|
|
def extract_text(file):
    """Return the plain text contained in a PDF, DOCX or PPTX file.

    Args:
        file: A filesystem path given as ``str``, an upload object that
            exposes the path through a ``name`` attribute, or ``None``.

    Returns:
        The extracted text. PDF page texts are concatenated as returned by
        the PDF library; every DOCX paragraph and every PPTX shape text is
        followed by a newline. An empty string is returned when ``file`` is
        ``None`` or has any other extension, so a caller never mistakes an
        error message for document content.
    """
    if file is None:
        return ""

    # Accept plain path strings as well as upload wrappers carrying ``.name``.
    path = file if isinstance(file, str) else file.name
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        # The context manager releases the document handle even when reading
        # a page raises.
        with fitz.open(path) as pdf:
            return "".join(page.get_text() for page in pdf)

    if lowered.endswith(".docx"):
        return "".join(
            paragraph.text + "\n" for paragraph in docx.Document(path).paragraphs
        )

    if lowered.endswith(".pptx"):
        pieces = []
        for slide in Presentation(path).slides:
            for shape in slide.shapes:
                # Not every shape carries text (pictures, connectors, ...).
                if hasattr(shape, "text"):
                    pieces.append(shape.text + "\n")
        return "".join(pieces)

    # Unsupported extension: report "no text" instead of returning an error
    # sentence that would be chunked and indexed like real content.
    return ""
| |
def _chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into ``chunk_size``-character windows overlapping by ``overlap``."""
    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]


def build_kb(file):
    """Build the in-memory knowledge base from an uploaded document.

    The document text is split into overlapping character windows, each
    window is embedded with ``embed_model``, and the vectors are stored in a
    fresh FAISS L2 index. On success the module-level ``chunks_store`` and
    ``index`` are replaced.

    Args:
        file: The uploaded file as passed on by the UI, or ``None`` when the
            button is pressed without an upload.

    Returns:
        A status message for display.
    """
    global chunks_store, index

    # Pressing the button before uploading hands us None.
    if file is None:
        return "Please upload a file first."

    text = extract_text(file)
    if not text.strip():
        return "No text extracted from file."

    # Non-blank text always yields at least one chunk, so no emptiness check
    # is needed after this point.
    chunks = _chunk_text(text)

    # FAISS expects a contiguous float32 matrix.
    vectors = np.ascontiguousarray(embed_model.encode(chunks), dtype="float32")

    new_index = faiss.IndexFlatL2(vectors.shape[1])
    new_index.add(vectors)

    # Publish chunks and index together, and only after everything succeeded,
    # so a failure above can never leave new chunks paired with an old index.
    chunks_store, index = chunks, new_index

    return f"Knowledge Base Created with {len(chunks)} chunks"
|
|
def _first_sentence(text):
    """Return *text* cut after the first period that ends a sentence.

    A period only counts when it is followed by whitespace or the end of the
    string, so decimals and dotted identifiers such as "3.14" stay intact.
    """
    for pos, char in enumerate(text):
        if char != ".":
            continue
        following = text[pos + 1:pos + 2]
        if not following or following.isspace():
            return text[:pos + 1]
    return text


def ask_question(question):
    """Answer *question* from the chunks stored in the knowledge base.

    The question is embedded, the nearest chunks (at most two) are looked up
    in the FAISS index, and the generator model is prompted with those chunks
    as context.

    Args:
        question: The user's question; ``None`` or blank input is rejected.

    Returns:
        A ``(answer, sources)`` tuple of strings. ``sources`` is the retrieved
        context joined by blank lines, or ``""`` when nothing was retrieved.
    """
    if not question or not question.strip():
        return "Please enter a question.", ""

    if index is None or index.ntotal == 0:
        return "Upload knowledge first.", ""

    query = np.ascontiguousarray(embed_model.encode([question]), dtype="float32")

    # Never ask for more neighbours than there are vectors: FAISS pads the
    # result with -1, which would silently index the last chunk.
    top_k = min(2, index.ntotal)
    _, neighbours = index.search(query, k=top_k)

    hits = [
        chunks_store[i] for i in neighbours[0] if 0 <= i < len(chunks_store)
    ]
    if not hits:
        return "Information not found in document.", ""
    retrieved = "\n\n".join(hits)

    prompt = (
        "\n"
        "Use only the provided context.\n"
        "\n"
        "If the answer is not found in the context, reply:\n"
        "Information not found in document.\n"
        "\n"
        "Context:\n"
        f"{retrieved}\n"
        "\n"
        "Question:\n"
        f"{question}\n"
        "\n"
        "Answer in one concise sentence:\n"
    )

    # NOTE(review): the question follows the context, so if the prompt ever
    # exceeds 512 tokens, truncation drops the question first.
    inputs = gen_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )

    outputs = gen_model.generate(
        **inputs,
        max_new_tokens=35,
        num_beams=4,
        do_sample=False,
        early_stopping=True,
    )

    answer = gen_tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    # Keep only the first sentence, as the prompt requests.
    return _first_sentence(answer), retrieved
|
|
|
|
# Two-tab Gradio interface: the first tab ingests a document through
# build_kb(), the second sends a question to ask_question() and shows the
# answer together with the retrieved source text.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 📚 RAG Question Answering System
        Ask questions over your own documents
        """
    )

    with gr.Tab("Build Knowledge Base"):
        doc = gr.File(label="Upload PDF / DOCX / PPTX")
        status = gr.Textbox(label="Status")
        build_btn = gr.Button("Create Knowledge Base")

        # The status box receives the message returned by build_kb().
        build_btn.click(fn=build_kb, inputs=doc, outputs=status)

    with gr.Tab("Ask Questions"):
        question = gr.Textbox(label="Ask a Question")
        answer = gr.Textbox(label="Grounded Answer", lines=6)
        sources = gr.Textbox(label="Sources", lines=8)
        ask_btn = gr.Button("Ask")

        # ask_question() returns (answer, sources), matching the two outputs.
        ask_btn.click(
            fn=ask_question,
            inputs=question,
            outputs=[answer, sources],
        )

# Start the web server for the interface defined above.
demo.launch()