# app.py for the jsakshi/Bajaj Space: a RAG chatbot over the Arogya Sanjeevani policy PDF
# Imports
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import gradio as gr
import requests
# Step 1: Download the PDF to a local file
pdf_url = "https://huggingface.co/spaces/jsakshi/Bajaj/resolve/main/Arogya%20Sanjeevani%20Policy%20-%20CIN%20-%20U10200WB1906GOI001713%201.pdf"
response = requests.get(pdf_url)
response.raise_for_status()  # fail fast instead of writing an error page to disk
with open("policy.pdf", "wb") as f:
    f.write(response.content)
# Step 2: Load and split PDF
loader = PyPDFLoader("policy.pdf")
pages = loader.load()
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = splitter.split_documents(pages)
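# Optional sanity check (a sketch; counts vary with the PDF contents):
# confirm the splitter actually produced chunks before building the vector store.
# print(f"Split {len(pages)} pages into {len(docs)} chunks")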
# Step 3: Dummy embeddings (placeholder; replace with real ones later)
# NOTE: all-zero vectors make every similarity score identical, so the
# retriever returns effectively arbitrary chunks until real embeddings are used.
class DummyEmbeddings:
    def embed_documents(self, texts):
        return [[0.0] * 1536 for _ in texts]

    def embed_query(self, text):
        return [0.0] * 1536

db = Chroma.from_documents(docs, embedding=DummyEmbeddings())
retriever = db.as_retriever()
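# A minimal sketch of swapping in real embeddings, assuming the
# sentence-transformers package is available in the Space:
# from langchain_community.embeddings import HuggingFaceEmbeddings
# db = Chroma.from_documents(docs, embedding=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"))
# retriever = db.as_retriever(search_kwargs={"k": 4})  # k = number of chunks returned per query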
# Step 4: Load a small open model instead of Mistral
model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
llm_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
llm = HuggingFacePipeline(pipeline=llm_pipeline)
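# Optional sketch: flan-t5 defaults to short outputs; max_new_tokens is a
# standard transformers generation argument if longer answers are needed.
# llm_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)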
# Step 5: RAG Chain
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
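# Optional sketch: RetrievalQA can also surface the retrieved chunks for
# debugging via its return_source_documents flag.
# qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)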
# Step 6: Gradio UI
def answer_query(query):
    return qa_chain.run(query)

interface = gr.Interface(
    fn=answer_query,
    inputs=gr.Textbox(lines=2, placeholder="Ask something about the policy..."),
    outputs="text",
    title="🧠 RAG Chatbot",
    description="Ask questions from the Arogya Sanjeevani insurance policy PDF",
)

interface.launch()
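# Note: on a Hugging Face Space, launch() serves the app automatically;
# share=True is only needed for a public link when running locally.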