File size: 5,491 Bytes
8800ef0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158

import csv

from llama_index.core.schema import Document
from llama_index.core.node_parser import SimpleNodeParser, SentenceSplitter
from llama_index.core.schema import IndexNode
from llama_index.core import ServiceContext
from llama_index.core.query_engine.retriever_query_engine import RetrieverQueryEngine
from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
import faiss
from llama_index.core.settings import Settings
from llama_index.llms.openai import OpenAI
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
import torch
import pandas as pd
import os
import json
#from IPython.display import Markdown, display

os.environ["OPENAI_API_KEY"]='use a registered OpenAI API key'


# def get_query_data(file_name):
#     # with open("unique_questions.tsv",'r', encoding='UTF-8') as file:
#     #     tsv_file = csv.reader(file, delimiter="\t")

#     df = pd.read_csv(file_name, delimiter="\t")
#     n = len(df)
#     query_ls = []
#     for i in range (n):
#         row = df.iloc[i]
#         query = row['questions']
#         query_ls.append(query)

#     return query_ls

def get_squad_question():
    """Load the SQuAD dev set and return a flat list of all question strings.

    Reads ``data_sample/squad_dev-v2.0.json`` (SQuAD v2.0 layout:
    ``data -> paragraphs -> qas -> question``) and collects every question
    across all articles and paragraphs, in document order.

    Returns:
        list[str]: every question found in the file.
    """
    # Explicit encoding: SQuAD ships as UTF-8; don't rely on the platform default.
    with open('data_sample/squad_dev-v2.0.json', 'r', encoding='utf-8') as file:
        squad = json.load(file)

    # Flatten the nested article -> paragraph -> QA structure in one pass
    # (replaces three index-based range(len(...)) loops; the unused node_ls
    # accumulator from the original is dropped).
    return [
        qa['question']
        for article in squad['data']
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    ]

def index_gen(file_name):
    """Build a FAISS-backed vector index over the IRC accessions text files.

    NOTE(review): the ``file_name`` parameter is currently unused — the input
    directory is hard-coded below. It is kept to preserve the call interface;
    confirm whether it was meant to drive the reader.

    Args:
        file_name: unused (see note above).

    Returns:
        VectorStoreIndex backed by an in-memory flat-L2 FAISS index.
    """
    reader = SimpleDirectoryReader(
        input_dir="data_sample/Library_data/ASpaceAccessions/IRC_accessions_txt",
        required_exts=[".txt"],
    )
    documents = reader.load_data()

    # Must match the output dimension of the configured embedding model
    # (BAAI/bge-small-en-v1.5 -> 384).
    embedding_dim = 384
    faiss_index = faiss.IndexFlatL2(embedding_dim)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    parser = SentenceSplitter()
    nodes = parser.get_nodes_from_documents(documents)

    # BUG FIX: the original did `zip(nodes, documents)`, but the splitter
    # yields multiple nodes per document, so zip() paired most nodes with the
    # wrong document and silently skipped every node past len(documents).
    # Each node already inherits its source document's metadata, so take
    # file_name from the node itself.
    for node in nodes:
        node.metadata["source"] = node.metadata.get("file_name", "unknown")

    return VectorStoreIndex(nodes, storage_context=storage_context)


torch.cuda.empty_cache()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model


# Strict RAG prompt: the model must answer only from the retrieved context.
custom_prompt = PromptTemplate(
    template=(
        "Use the following context to answer the query. Do not use outside knowledge. "
        "If the answer is not found in the context, respond with: 'I do not have the answer.'\n"
        "Context: {context_str}\n"
        "Query: {query_str}\n"
        "Answer:"
    )
)

llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=512)

Settings.llm = llm
Settings.chunk_size = 512
Settings.chunk_overlap = 10

# NOTE(review): the original assigned file_name twice ("unique_questions.tsv"
# was immediately overwritten by this path); only the last value was ever
# used — and index_gen ignores its argument anyway.
file_name = "data_sample/Library_data/new_IRC_accessions.json"

index = index_gen(file_name)

nodes = index.docstore.docs.values()


retriever = index.as_retriever(similarity_top_k=2)
query_engine = RetrieverQueryEngine(retriever=retriever)

query_engine.update_prompts({"response_synthesizer:text_qa_template": custom_prompt})

prompts_dict = query_engine.get_prompts()


def _query_with_source(engine, question):
    """Run one query and return (answer_text, source_file_name).

    The file name comes from the first retrieved node's metadata —
    equivalent to response.source_nodes[0].metadata['file_name'].
    """
    response = engine.query(question)
    first_meta = list(response.metadata.values())[0]
    return response.response, first_meta['file_name']


# Example queries from a part of the UKY Libraries data
# (refactored: the original repeated the same query/metadata-extraction
# stanza five times; output is identical).
queries = [
    "When did the Faculty council and curriculam committee establish?",
    "Who did design the house in the photograph?",
    "What are the contents available on Ferrel Wellman's career?",
    "How many boxes of materials related to Good food Co-op are available in the library?",
    "What was the purpose of the KMF?",
]

for i, question in enumerate(queries):
    answer, source_file = _query_with_source(query_engine, question)
    if i:
        # The original printed a bare "\n" between consecutive answers.
        print("\n")
    print(answer, source_file)