| |
| from langchain_community.document_loaders import PyPDFLoader |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from langchain_community.vectorstores import Chroma |
| from langchain_community.llms import HuggingFacePipeline |
| from langchain.chains import RetrievalQA |
|
|
| from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline |
| import gradio as gr |
| import requests |
|
|
| |
# Fetch the policy PDF once at startup and cache it on disk for PyPDFLoader.
pdf_url = "https://huggingface.co/spaces/jsakshi/Bajaj/resolve/main/Arogya%20Sanjeevani%20Policy%20-%20CIN%20-%20U10200WB1906GOI001713%201.pdf"
response = requests.get(pdf_url, timeout=60)
# Fail fast on HTTP errors (404/500) instead of silently writing an HTML
# error page to policy.pdf, which would break the PDF loader downstream.
response.raise_for_status()
with open("policy.pdf", "wb") as f:
    f.write(response.content)
|
|
| |
# Parse the downloaded PDF into per-page documents, then cut the pages into
# 1000-character chunks with 200 characters of overlap so retrieved passages
# keep enough surrounding context.
pages = PyPDFLoader("policy.pdf").load()
chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = chunker.split_documents(pages)
|
|
| |
class DummyEmbeddings:
    """Placeholder embedding backend that returns all-zero vectors.

    NOTE(review): with zero vectors every chunk is equidistant from every
    query, so the retriever's ranking is meaningless — swap in a real
    embedding model (e.g. a sentence-transformers wrapper) for production.

    Args:
        dim: Length of each embedding vector. Defaults to 1536, the size
            the original code hard-coded.
    """

    def __init__(self, dim=1536):
        # Stored so both methods emit vectors of a consistent length.
        self.dim = dim

    def embed_documents(self, texts):
        """Return one zero vector per input text."""
        return [[0.0] * self.dim for _ in texts]

    def embed_query(self, text):
        """Return a single zero vector for the query string."""
        return [0.0] * self.dim
|
|
# Index the chunks in an in-memory Chroma store and expose it through the
# standard LangChain retriever interface used by the QA chain below.
vector_db = Chroma.from_documents(documents=docs, embedding=DummyEmbeddings())
retriever = vector_db.as_retriever()
|
|
| |
|
|
|
|
# Load FLAN-T5 (a small instruction-tuned seq2seq model) and wrap it in a
# HuggingFace text2text-generation pipeline so LangChain can drive it.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_id = "google/flan-t5-base"
t5_tokenizer = AutoTokenizer.from_pretrained(model_id)
t5_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

generation_pipeline = pipeline(
    "text2text-generation",
    model=t5_model,
    tokenizer=t5_tokenizer,
)
llm = HuggingFacePipeline(pipeline=generation_pipeline)
|
|
|
|
| |
# Wire the retriever and the local LLM into a retrieval-augmented QA chain:
# relevant chunks are fetched first, then stuffed into the model's prompt.
qa_chain = RetrievalQA.from_chain_type(retriever=retriever, llm=llm)
|
|
| |
def answer_query(query):
    """Answer a user question by running it through the retrieval-QA chain."""
    answer = qa_chain.run(query)
    return answer
|
|
# Minimal Gradio front end: one textbox in, one text answer out.
demo = gr.Interface(
    fn=answer_query,
    inputs=gr.Textbox(lines=2, placeholder="Ask something about the policy..."),
    outputs="text",
    title="🧠 RAG Chatbot",
    description="Ask questions from the Arogya Sanjeevani insurance policy PDF",
)

demo.launch()
|
|