# RAG-Lib / rag_model_uky_libs_api.py
# Uploaded via huggingface_hub (commit 8800ef0, verified)
import csv
from llama_index.core.schema import Document
from llama_index.core.node_parser import SimpleNodeParser, SentenceSplitter
from llama_index.core.schema import IndexNode
from llama_index.core import ServiceContext
from llama_index.core.query_engine.retriever_query_engine import RetrieverQueryEngine
from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
import faiss
from llama_index.core.settings import Settings
from llama_index.llms.openai import OpenAI
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
import torch
import pandas as pd
import os
import json
#from IPython.display import Markdown, display
# Placeholder only — replace with a real key; prefer setting the variable in the
# shell environment rather than hard-coding it in source.
os.environ["OPENAI_API_KEY"]='use a registered OpenAI API key'
# def get_query_data(file_name):
# # with open("unique_questions.tsv",'r', encoding='UTF-8') as file:
# # tsv_file = csv.reader(file, delimiter="\t")
# df = pd.read_csv(file_name, delimiter="\t")
# n = len(df)
# query_ls = []
# for i in range (n):
# row = df.iloc[i]
# query = row['questions']
# query_ls.append(query)
# return query_ls
def get_squad_question(file_path='data_sample/squad_dev-v2.0.json'):
    """Collect every question string from a SQuAD v2.0-format JSON file.

    Args:
        file_path: Path to a SQuAD-format JSON file. Defaults to the bundled
            dev set so existing no-argument callers keep working.

    Returns:
        list[str]: All questions, in article/paragraph/qas order.
    """
    with open(file_path, 'r') as file:
        squad = json.load(file)
    question_ls = []
    # SQuAD layout: data -> articles -> paragraphs -> qas -> question.
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                question_ls.append(qa['question'])
    return question_ls
def index_gen(file_name, input_dir="data_sample/Library_data/ASpaceAccessions/IRC_accessions_txt"):
    """Build a FAISS-backed VectorStoreIndex over the .txt accession records.

    Args:
        file_name: Unused; retained for backward compatibility with existing
            callers (the function has always read from a directory instead).
        input_dir: Directory of .txt files to index.

    Returns:
        VectorStoreIndex over sentence-split nodes, each carrying its source
        file name in node.metadata["source"].
    """
    reader = SimpleDirectoryReader(input_dir=input_dir, required_exts=[".txt"])
    documents = reader.load_data()

    # Must match the output dimension of the configured embedding model
    # (BAAI/bge-small-en-v1.5 produces 384-dim vectors).
    embedding_dim = 384
    faiss_index = faiss.IndexFlatL2(embedding_dim)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    parser = SentenceSplitter()
    nodes = parser.get_nodes_from_documents(documents)

    # Bug fix: the original zip(nodes, documents) paired chunk i with document i,
    # which misattributes sources as soon as any document splits into more than
    # one node (and silently drops tagging for the remaining chunks). Map each
    # node back to its originating document via its ref_doc_id instead.
    doc_sources = {doc.doc_id: doc.metadata.get("file_name", "unknown") for doc in documents}
    for node in nodes:
        node.metadata["source"] = doc_sources.get(node.ref_doc_id, "unknown")

    index = VectorStoreIndex(nodes, storage_context=storage_context)
    return index
torch.cuda.empty_cache()
# Prefer GPU when available; embedding inference falls back to CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Embedding model: 384-dim output, which must agree with the FAISS index
# dimension used in index_gen().
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model

# Prompt that restricts answers to the retrieved context only; the model is
# told to refuse rather than fall back on outside knowledge.
custom_prompt = PromptTemplate(
    template=(
        "Use the following context to answer the query. Do not use outside knowledge. "
        "If the answer is not found in the context, respond with: 'I do not have the answer.'\n"
        "Context: {context_str}\n"
        "Query: {query_str}\n"
        "Answer:"
    )
)

# temperature=0 for deterministic answers; completions capped at 512 tokens.
# (Removed a dead `file_name = "unique_questions.tsv"` assignment that was
# overwritten before any use.)
llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=512)
Settings.llm = llm
Settings.chunk_size = 512
Settings.chunk_overlap = 10
# Build the FAISS-backed index over the library accession records and wire up
# a retrieval query engine answering from the top-2 retrieved chunks.
file_name = "data_sample/Library_data/new_IRC_accessions.json"
index = index_gen(file_name)  # NOTE(review): index_gen ignores this argument and reads a hard-coded directory — confirm intent
nodes = index.docstore.docs.values()  # all parsed nodes; not referenced again below
retriever = index.as_retriever(similarity_top_k=2)  # top-2 nearest chunks per query
#query_engine = index.as_query_engine()
query_engine = RetrieverQueryEngine(retriever=retriever)
# Swap in the context-only prompt so answers come solely from retrieved text.
query_engine.update_prompts({"response_synthesizer:text_qa_template": custom_prompt})
prompts_dict = query_engine.get_prompts()  # current prompt set, kept for inspection/debugging
# Example queries from a part of the UKY Libraries data.
# Query strings are preserved exactly as originally issued (including spelling).
example_queries = [
    "When did the Faculty council and curriculam committee establish?",
    "Who did design the house in the photograph?",
    "What are the contents available on Ferrel Wellman's career?",
    "How many boxes of materials related to Good food Co-op are available in the library?",
    "What was the purpose of the KMF?",
]

def _source_file(response):
    """Return the 'file_name' recorded in a response's first metadata entry.

    Equivalent to response.source_nodes[0].metadata['file_name'] — the first
    retrieved node's source document.
    """
    first_meta = next(iter(response.metadata.values()))
    return first_meta['file_name']

# Run every query up front, then print answers separated by a blank-line gap
# (print("\n") emits two newlines, matching the original output format).
results = [(query_engine.query(q)) for q in example_queries]
for i, response in enumerate(results):
    if i:
        print("\n")
    print(response.response, _source_file(response))