Spaces:
Sleeping
Sleeping
# --- Standard library ---
import csv
import json
import os

# --- Third-party ---
import faiss
import pandas as pd
import torch
from transformers import BitsAndBytesConfig

# --- LlamaIndex ---
from llama_index.core import ServiceContext, SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SimpleNodeParser, SentenceSplitter
from llama_index.core.prompts import PromptTemplate
from llama_index.core.query_engine.retriever_query_engine import RetrieverQueryEngine
from llama_index.core.schema import Document, IndexNode
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.faiss import FaissVectorStore

# NOTE(review): this is a placeholder, not a real key. Prefer supplying
# OPENAI_API_KEY from the environment or a secret store instead of
# hard-coding it in source.
os.environ["OPENAI_API_KEY"]='use a registered OpenAI API key'
def get_squad_question(path='data_sample/squad_dev-v2.0.json'):
    """Collect every question string from a SQuAD-v2.0-format JSON file.

    Args:
        path: Location of the SQuAD JSON file. Defaults to the original
            hard-coded location, so existing zero-argument calls still work.

    Returns:
        list[str]: All questions across every article and paragraph,
        in file order.
    """
    with open(path, 'r', encoding='utf-8') as file:
        squad = json.load(file)
    # SQuAD layout: data -> [article] -> paragraphs -> [paragraph] -> qas -> [qa].
    # Direct iteration replaces the original triple range(len(...)) index loops;
    # the original also built an unused node_ls, dropped here.
    return [
        qa['question']
        for article in squad['data']
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    ]
def index_gen(file_name):
    """Build a FAISS-backed vector index over the accession .txt files.

    Args:
        file_name: Unused; kept only for backward compatibility with existing
            callers. Documents are loaded from the fixed directory below.

    Returns:
        VectorStoreIndex: Index over sentence-split nodes, each tagged with a
        ``source`` metadata entry naming its originating file.
    """
    reader = SimpleDirectoryReader(input_dir="data_sample/Library_data/ASpaceAccessions/IRC_accessions_txt", required_exts=[".txt"])
    documents = reader.load_data()
    # Must match the output dimension of the embedding model configured in
    # Settings (BAAI/bge-small-en-v1.5 emits 384-d vectors).
    embedding_dim = 384
    faiss_index = faiss.IndexFlatL2(embedding_dim)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    parser = SentenceSplitter()
    nodes = parser.get_nodes_from_documents(documents)
    # BUG FIX: the original zipped nodes with documents, but the splitter can
    # emit several nodes per document, so node i generally does NOT come from
    # document i and most nodes got the wrong (or no) source. Each node already
    # inherits its own document's metadata, so read file_name from the node.
    for node in nodes:
        node.metadata["source"] = node.metadata.get("file_name", "unknown")
    index = VectorStoreIndex(nodes, storage_context=storage_context)
    return index
# --- Runtime configuration -------------------------------------------------
torch.cuda.empty_cache()
# Selected device (currently informational only; the embedding model and LLM
# manage their own placement).
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Embedding model: 384-dim sentence embeddings — must agree with the FAISS
# dimension hard-coded in index_gen().
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model

# Prompt that restricts the LLM to the retrieved context only, with an
# explicit "I do not have the answer." fallback.
custom_prompt = PromptTemplate(
    template=(
        "Use the following context to answer the query. Do not use outside knowledge. "
        "If the answer is not found in the context, respond with: 'I do not have the answer.'\n"
        "Context: {context_str}\n"
        "Query: {query_str}\n"
        "Answer:"
    )
)

llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=512)
Settings.llm = llm
Settings.chunk_size = 512
Settings.chunk_overlap = 10

# BUG FIX: the original assigned file_name twice; the first value
# ("unique_questions.tsv") was dead code, immediately shadowed by this one.
# Note index_gen() ignores this argument — see its docstring.
file_name = "data_sample/Library_data/new_IRC_accessions.json"
index = index_gen(file_name)
nodes = index.docstore.docs.values()
retriever = index.as_retriever(similarity_top_k=2)
query_engine = RetrieverQueryEngine(retriever=retriever)
# Swap in the restrictive QA prompt on the response synthesizer.
query_engine.update_prompts({"response_synthesizer:text_qa_template": custom_prompt})
prompts_dict = query_engine.get_prompts()
# Example queries from a part of the UKY Libraries data.
# The original repeated the query/metadata/print pattern five times; this loop
# is equivalent: all queries run first, then all answers print, separated by a
# blank-line print (with no trailing separator after the last one).
example_queries = [
    "When did the Faculty council and curriculam committee establish?",
    "Who did design the house in the photograph?",
    "What are the contents available on Ferrel Wellman's career?",
    "How many boxes of materials related to Good food Co-op are available in the library?",
    "What was the purpose of the KMF?",
]
results = []
for query in example_queries:
    response = query_engine.query(query)
    # The first metadata entry carries the source file of the top retrieved
    # node. Equivalent: response.source_nodes[0].metadata['file_name'].
    source_file = list(response.metadata.values())[0]['file_name']
    results.append((response, source_file))
for i, (response, source_file) in enumerate(results):
    if i:
        print("\n")
    print(response.response, source_file)