"""Build a FAISS-backed vector index over UKY Libraries accession records
and answer a set of example queries with an OpenAI LLM via llama-index.

Requires: llama-index (core, FAISS vector store, HuggingFace embeddings,
OpenAI LLM), faiss, torch, and a valid OPENAI_API_KEY.
"""

import csv  # noqa: F401 -- kept: legacy import, other parts of the project may rely on it
import json
import os

import faiss
import pandas as pd  # noqa: F401 -- legacy import
import torch
from llama_index.core import ServiceContext  # noqa: F401 -- legacy import
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter, SimpleNodeParser  # noqa: F401
from llama_index.core.prompts import PromptTemplate
from llama_index.core.query_engine.retriever_query_engine import RetrieverQueryEngine
from llama_index.core.schema import Document, IndexNode  # noqa: F401 -- legacy imports
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.faiss import FaissVectorStore
from transformers import BitsAndBytesConfig  # noqa: F401 -- legacy import

# NOTE(review): placeholder key -- set a real key (preferably in the shell
# environment, not in source) before running.
os.environ["OPENAI_API_KEY"] = 'use a registered OpenAI API key'


def get_squad_question(squad_path='data_sample/squad_dev-v2.0.json'):
    """Return every question string found in a SQuAD v2.0-format file.

    Args:
        squad_path: Path to the SQuAD JSON file.  The default keeps the
            original hard-coded location for backward compatibility.

    Returns:
        list[str]: all questions, in document order.
    """
    with open(squad_path, 'r') as file:
        squad = json.load(file)
    # Flatten data -> paragraphs -> qas -> question in one comprehension
    # (replaces the original triple index loop; the unused context/node_ls
    # accumulation was dead code and is dropped).
    return [
        qa['question']
        for article in squad['data']
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    ]


def index_gen(file_name=None):
    """Build a FAISS-backed ``VectorStoreIndex`` from the accession .txt files.

    Args:
        file_name: Unused legacy argument, kept (now with a default) so
            existing ``index_gen(path)`` call sites keep working.  The input
            directory below is hard-coded, as in the original.

    Returns:
        VectorStoreIndex: index over sentence-split nodes, each tagged with a
        ``source`` metadata entry naming its originating file.
    """
    reader = SimpleDirectoryReader(
        input_dir="data_sample/Library_data/ASpaceAccessions/IRC_accessions_txt",
        required_exts=[".txt"],
    )
    documents = reader.load_data()

    # Must match the embedding model's output dimension (bge-small-en-v1.5 -> 384).
    embedding_dim = 384
    faiss_index = faiss.IndexFlatL2(embedding_dim)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    parser = SentenceSplitter()
    nodes = parser.get_nodes_from_documents(documents)
    # BUG FIX: the original zipped ``nodes`` with ``documents``, but one
    # document usually splits into several nodes, so the pairing drifted after
    # the first multi-node document and nodes got the wrong source file.
    # SentenceSplitter propagates each document's metadata (including
    # ``file_name``) onto its own nodes, so read provenance from the node.
    for node in nodes:
        node.metadata["source"] = node.metadata.get("file_name", "unknown")

    return VectorStoreIndex(nodes, storage_context=storage_context)


def _answer(query_engine, query):
    """Run one query; return ``(answer_text, source_file_name)`` for the top hit."""
    response = query_engine.query(query)
    # response.metadata maps node-id -> that node's metadata; the first entry
    # corresponds to the top retrieved node (same as the original
    # ``list(response.metadata.values())[0]['file_name']``).
    first_meta = next(iter(response.metadata.values()))
    return response.response, first_meta['file_name']


def main():
    """Configure models, build the index, and print answers for sample queries."""
    torch.cuda.empty_cache()
    # NOTE(review): ``device`` was computed but never used in the original;
    # dropped.  Model placement is handled by the libraries themselves.

    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    Settings.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=512)
    Settings.chunk_size = 512
    Settings.chunk_overlap = 10

    # Grounded-answer prompt: forbid outside knowledge, force an explicit
    # "I do not have the answer." when the context is insufficient.
    custom_prompt = PromptTemplate(
        template=(
            "Use the following context to answer the query. Do not use outside knowledge. "
            "If the answer is not found in the context, respond with: 'I do not have the answer.'\n"
            "Context: {context_str}\n"
            "Query: {query_str}\n"
            "Answer:"
        )
    )

    file_name = "data_sample/Library_data/new_IRC_accessions.json"
    index = index_gen(file_name)
    retriever = index.as_retriever(similarity_top_k=2)
    query_engine = RetrieverQueryEngine(retriever=retriever)
    query_engine.update_prompts({"response_synthesizer:text_qa_template": custom_prompt})

    # Example queries from a part of the UKY Libraries data.  Query strings
    # are preserved verbatim (including original typos) so retrieval behavior
    # is unchanged.
    queries = [
        "When did the Faculty council and curriculam committee establish?",
        "Who did design the house in the photograph?",
        "What are the contents available on Ferrel Wellman's career?",
        "How many boxes of materials related to Good food Co-op are available in the library?",
        "What was the purpose of the KMF?",
    ]
    for i, query in enumerate(queries):
        if i:
            print("\n")  # blank-line separator between answers, as before
        answer, source_file = _answer(query_engine, query)
        print(answer, source_file)


if __name__ == "__main__":
    main()