Spaces:
Sleeping
Sleeping
Create rag_system.py
Browse files- rag_system.py +61 -0
rag_system.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 4 |
+
from langchain_community.vectorstores import FAISS
|
| 5 |
+
from langchain.text_splitter import CharacterTextSplitter
|
| 6 |
+
from langchain.docstore.document import Document
|
| 7 |
+
from transformers import pipeline
|
| 8 |
+
from langchain.prompts import PromptTemplate
|
| 9 |
+
|
| 10 |
+
class RAGSystem:
    """Retrieval-augmented QA over a CSV of product titles.

    Builds a FAISS vector index from the ``Title`` column of a CSV file and
    answers questions with an extractive question-answering model whose
    context is the text of the retrieved documents.
    """

    def __init__(self, csv_path="apparel.csv"):
        """Build the vector index from *csv_path* and load the QA model.

        Args:
            csv_path: Path to a CSV file containing a ``Title`` column.

        Raises:
            FileNotFoundError: If *csv_path* does not exist.
            ValueError: If the CSV has no ``Title`` column.
        """
        self.setup_system(csv_path)
        # Extractive QA: selects an answer span out of the retrieved context.
        self.qa_pipeline = pipeline(
            "question-answering",
            model="distilbert-base-cased-distilled-squad",
        )

    def setup_system(self, csv_path):
        """Read the CSV and build the FAISS vector store and retriever."""
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"CSV file not found at {csv_path}")

        documents = pd.read_csv(csv_path)
        # Fail fast with a clear message instead of a KeyError mid-loop.
        if 'Title' not in documents.columns:
            raise ValueError(f"CSV file {csv_path} has no 'Title' column")

        # One Document per row; str() guards against non-string Title values
        # (e.g. NaN or numeric titles coming out of pandas).
        docs = [
            Document(page_content=str(row['Title']), metadata={'index': idx})
            for idx, row in documents.iterrows()
        ]

        # Titles are usually short, so most documents pass through unsplit;
        # the splitter only matters for unusually long entries.
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        split_docs = text_splitter.split_documents(docs)

        # Build the embedding index once at startup; queries reuse it.
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        self.vector_store = FAISS.from_documents(split_docs, embeddings)
        self.retriever = self.vector_store.as_retriever()

    def process_query(self, query, max_context_chars=1000):
        """Answer *query* using retrieved documents as QA context.

        Args:
            query: The user question.
            max_context_chars: Truncation limit for the concatenated
                retrieved text — the QA model has a bounded input length.
                Defaults to the original hard-coded 1000.

        Returns:
            The answer string extracted by the QA model.
        """
        retrieved_docs = self.retriever.get_relevant_documents(query)

        # Concatenate the retrieved texts, then cap the context size.
        retrieved_text = "\n".join(
            doc.page_content for doc in retrieved_docs
        )[:max_context_chars]

        response = self.qa_pipeline({"question": query, "context": retrieved_text})
        return response['answer']

    def get_similar_documents(self, query, k=5):
        """Return up to *k* similar documents without running the QA pipeline.

        Queries the vector store directly so that *k* is actually honoured:
        the default retriever returns only 4 documents, so slicing its
        output could never yield the advertised default of 5.

        Returns:
            A list of ``{'content': ..., 'metadata': ...}`` dicts.
        """
        docs = self.vector_store.similarity_search(query, k=k)
        return [
            {'content': doc.page_content, 'metadata': doc.metadata}
            for doc in docs
        ]
|