Create app.py
app.py ADDED
@@ -0,0 +1,67 @@
+import gradio as gr
+import torch
+import numpy as np
+from transformers import DistilBertTokenizer, DistilBertModel
+import faiss
+
+# Load the DistilBERT model and tokenizer
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+model = DistilBertModel.from_pretrained("distilbert-base-uncased")
+
+# Example documents to simulate a knowledge base
+documents = [
+    "Python is a programming language that is widely used in data science and machine learning.",
+    "The Eiffel Tower is a famous landmark located in Paris, France.",
+    "Generative Adversarial Networks (GANs) are a class of machine learning models used for image generation.",
+    "Hugging Face is a company specializing in natural language processing and machine learning."
+]
+
+# Tokenize the documents and create embeddings
+def create_embeddings(documents):
+    embeddings = []
+    for doc in documents:
+        inputs = tokenizer(doc, return_tensors="pt", padding=True, truncation=True, max_length=512)
+        with torch.no_grad():
+            outputs = model(**inputs)
+        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
+    return np.array(embeddings)
+
+# Create FAISS index for document retrieval
+def create_faiss_index(embeddings):
+    index = faiss.IndexFlatL2(embeddings.shape[1])  # Use L2 distance for retrieval
+    index.add(embeddings)
+    return index
+
+# Create embeddings for the documents and the FAISS index
+document_embeddings = create_embeddings(documents)
+faiss_index = create_faiss_index(document_embeddings)
+
+# Function to retrieve the most relevant document based on the question
+def retrieve_document(question):
+    # Encode the question into an embedding
+    inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    question_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
+
+    # Search for the most relevant document using FAISS
+    D, I = faiss_index.search(np.array([question_embedding]), k=1)
+    return documents[I[0][0]]  # Return the most relevant document
+
+# Function to answer the question using the retrieved document
+def answer_question(question):
+    retrieved_doc = retrieve_document(question)
+    return f"Retrieved Document: {retrieved_doc}\nAnswer: {retrieved_doc}"
+
+# Create a Gradio interface for the chatbot
+interface = gr.Interface(
+    fn=answer_question,
+    inputs="text",
+    outputs="text",
+    live=True,
+    title="RAG-Based Question Answering with DistilBERT",
+    description="Ask a question, and I will retrieve the most relevant document to answer it."
+)
+
+# Launch the Gradio app
+interface.launch()
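As a quick sanity check, one could exercise retrieve_document directly in a Python shell after the definitions above have run, without launching the Gradio UI. A minimal sketch; the questions are illustrative inputs, not part of the commit:

# Hypothetical smoke test: queries the FAISS retriever directly
for q in ["What is Python used for?", "Where is the Eiffel Tower located?"]:
    print(q, "->", retrieve_document(q))

With only four documents and mean-pooled DistilBERT embeddings compared by L2 distance, retrieval is coarse, so close paraphrases of the stored sentences will match most reliably.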
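A Space installs Python dependencies from a requirements.txt next to app.py, so the imports above presumably need one. A plausible minimal version, assuming a CPU Space (faiss-cpu is the pip name for the CPU build of FAISS; version pins are left to the author):

# requirements.txt (assumed, not part of this commit)
gradio
torch
transformers
faiss-cpu
numpy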