# NOTE: removed non-Python page-scrape artifacts (site header, file size,
# and a run of fused line numbers) that made this file un-importable.
import csv
from llama_index.core.schema import Document
from llama_index.core.node_parser import SimpleNodeParser, SentenceSplitter
from llama_index.core.schema import IndexNode
from llama_index.core import ServiceContext
from llama_index.core.query_engine.retriever_query_engine import RetrieverQueryEngine
from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
import faiss
from llama_index.core.settings import Settings
from llama_index.llms.openai import OpenAI
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
import torch
import pandas as pd
import os
import json
#from IPython.display import Markdown, display
# NOTE(review): placeholder credential — replace with a real key, e.g. via the
# shell environment. setdefault keeps an externally supplied key instead of
# clobbering it with the placeholder (the original assignment always overwrote).
os.environ.setdefault("OPENAI_API_KEY", 'use a registered OpenAI API key')
# def get_query_data(file_name):
# # with open("unique_questions.tsv",'r', encoding='UTF-8') as file:
# # tsv_file = csv.reader(file, delimiter="\t")
# df = pd.read_csv(file_name, delimiter="\t")
# n = len(df)
# query_ls = []
# for i in range (n):
# row = df.iloc[i]
# query = row['questions']
# query_ls.append(query)
# return query_ls
def get_squad_question(file_path='data_sample/squad_dev-v2.0.json'):
    """Extract every question string from a SQuAD-v2.0-format JSON file.

    Parameters
    ----------
    file_path : str
        Path to a SQuAD-style JSON file. Defaults to the path the original
        hard-coded, so existing callers are unaffected.

    Returns
    -------
    list[str]
        All question texts, in document order
        (data -> paragraphs -> qas -> question).
    """
    with open(file_path, 'r') as file:
        squad = json.load(file)
    # Flatten the three-level SQuAD nesting in one pass; the original also
    # carried an unused node_ls accumulator, dropped here.
    return [
        qa['question']
        for article in squad['data']
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    ]
def index_gen(file_name, input_dir="data_sample/Library_data/ASpaceAccessions/IRC_accessions_txt"):
    """Build a FAISS-backed VectorStoreIndex over the .txt files in *input_dir*.

    Parameters
    ----------
    file_name : str
        NOTE(review): accepted for backward compatibility but never used —
        the original ignored it and read a hard-coded directory. Verify with
        callers whether it was meant to select the input.
    input_dir : str
        Directory of .txt files to index. Defaults to the path the original
        hard-coded, so existing callers are unaffected.

    Returns
    -------
    VectorStoreIndex backed by a flat-L2 FAISS index.
    """
    reader = SimpleDirectoryReader(input_dir=input_dir, required_exts=[".txt"])
    documents = reader.load_data()
    # 384 must match the output dimension of the configured embedding model
    # (BAAI/bge-small-en-v1.5 emits 384-dim vectors).
    embedding_dim = 384
    faiss_index = faiss.IndexFlatL2(embedding_dim)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    parser = SentenceSplitter()
    nodes = parser.get_nodes_from_documents(documents)
    # BUG FIX: the original did `zip(nodes, documents)`, pairing nodes with
    # documents positionally. A single document generally splits into several
    # nodes, so most nodes got the wrong document's file name (and trailing
    # nodes got none). Each node inherits its originating document's metadata
    # from the splitter, so read file_name from the node itself.
    for node in nodes:
        node.metadata["source"] = node.metadata.get("file_name", "unknown")
    index = VectorStoreIndex(nodes, storage_context=storage_context)
    return index
# --- Runtime / model configuration -----------------------------------------
torch.cuda.empty_cache()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model

# Prompt that forbids the LLM from using knowledge outside the retrieved context.
custom_prompt = PromptTemplate(
    template=(
        "Use the following context to answer the query. Do not use outside knowledge. "
        "If the answer is not found in the context, respond with: 'I do not have the answer.'\n"
        "Context: {context_str}\n"
        "Query: {query_str}\n"
        "Answer:"
    )
)

llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=512)
Settings.llm = llm
Settings.chunk_size = 512
Settings.chunk_overlap = 10

# NOTE(review): a dead `file_name = "unique_questions.tsv"` assignment that was
# immediately shadowed has been removed; only this value was ever used.
file_name = "data_sample/Library_data/new_IRC_accessions.json"
index = index_gen(file_name)
nodes = index.docstore.docs.values()
retriever = index.as_retriever(similarity_top_k=2)
query_engine = RetrieverQueryEngine(retriever=retriever)
query_engine.update_prompts({"response_synthesizer:text_qa_template": custom_prompt})
prompts_dict = query_engine.get_prompts()


def _source_file_name(response):
    """Return the file_name metadata of the first retrieved source node.

    Equivalent to response.source_nodes[0].metadata['file_name'].
    """
    first_meta = list(response.metadata.values())[0]
    return first_meta['file_name']


# Example queries from a part of the UKY Libraries data
queries = [
    "When did the Faculty council and curriculam committee establish?",
    "Who did design the house in the photograph?",
    "What are the contents available on Ferrel Wellman's career?",
    "How many boxes of materials related to Good food Co-op are available in the library?",
    "What was the purpose of the KMF?",
]

# Run every query, keeping (response, source file) pairs; the original wrote
# this out five times with numbered variables.
results = [(r, _source_file_name(r)) for r in (query_engine.query(q) for q in queries)]

# Print each answer with its source, separated by a blank-line print exactly
# as the original did (separator before every answer except the first).
for idx, (response, src) in enumerate(results):
    if idx:
        print("\n")
    print(response.response, src)