RegBotBeta2.0

Sleeping

Hung Bui

Duplicate from zhtet/RegBotBeta

e18671c over 2 years ago

2.9 kB

	import os
	import pickle
	from json import dumps, loads

	import numpy as np
	import openai
	import pandas as pd
	from dotenv import load_dotenv
	from huggingface_hub import HfFileSystem
	from llama_index import (
	Document,
	GPTVectorStoreIndex,
	LLMPredictor,
	PromptHelper,
	ServiceContext,
	StorageContext,
	load_index_from_storage,
	)
	from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

	from utils.customLLM import CustomLLM

	# Module-level setup: everything below runs once, at import time.
	# Read a .env file (if present) into the process environment so the
	# OPENAI_API_KEY lookup on the next line can see it.
	load_dotenv()
	openai.api_key = os.getenv("OPENAI_API_KEY")  # None when the variable is unset
	# Hugging Face Hub filesystem handle; in the visible part of this file it is
	# referenced only by commented-out code.
	fs = HfFileSystem()

	# get model (disabled: local Hugging Face model loading, kept for reference)
	# model_name = "bigscience/bloom-560m"
	# tokenizer = AutoTokenizer.from_pretrained(model_name)
	# model = AutoModelForCausalLM.from_pretrained(model_name, config='T5Config')

	# define prompt helper
	# set maximum input size
	context_window = 2048
	# set number of output tokens
	num_output = 525
	# set maximum chunk overlap, expressed as a ratio (per the variable name)
	chunk_overlap_ratio = 0.2
	# Arguments are passed positionally, so their order here is significant.
	prompt_helper = PromptHelper(context_window, num_output, chunk_overlap_ratio)

	# create a pipeline (disabled: depends on the commented-out model/tokenizer above)
	# pl = pipeline(
	# model=model,
	# tokenizer=tokenizer,
	# task="text-generation",
	# # device=0, # GPU device number
	# # max_length=512,
	# do_sample=True,
	# top_p=0.95,
	# top_k=50,
	# temperature=0.7
	# )

	# define llm
	# CustomLLM comes from the project-local utils.customLLM module.
	llm_predictor = LLMPredictor(llm=CustomLLM())
	# Shared service context bundling the predictor and the prompt helper; the
	# index-building code in this file passes it to GPTVectorStoreIndex.
	service_context = ServiceContext.from_defaults(
	llm_predictor=llm_predictor, prompt_helper=prompt_helper
	)


	def prepare_data(file_path: str):
	    """Load records from a JSON file and wrap each one in a ``Document``.

	    Rows that contain an empty string or a null value in any column are
	    discarded before conversion.

	    Args:
	        file_path: Path of the JSON file, read with ``pandas.read_json``.

	    Returns:
	        A list of ``Document`` objects, one per remaining row. Each uses
	        the row's ``paragraphText`` as text and ``_id.$oid`` as document
	        id, and carries ``chapter``, ``article`` and ``title`` as extra
	        info.
	    """
	    frame = pd.read_json(file_path)
	    # Turn empty strings into NaN so that dropna discards those rows too.
	    cleaned = frame.replace(to_replace="", value=np.nan).dropna(axis=0)

	    # Serialise to JSON and parse it back to get a list of plain dicts.
	    records = loads(cleaned.to_json(orient="records"))

	    return [
	        Document(
	            text=record["paragraphText"],
	            doc_id=record["_id"]["$oid"],
	            extra_info={
	                key: record[key] for key in ("chapter", "article", "title")
	            },
	        )
	        for record in records
	    ]


	def initialize_index(index_name):
	    """Return the vector index named ``index_name``, building it if needed.

	    When ``./vectorStores/<index_name>`` already exists the index is loaded
	    from that directory. Otherwise it is built from
	    ``./assets/regItems.json`` and persisted to that directory.

	    Args:
	        index_name: Name of the entry under ``./vectorStores`` that holds
	            (or will hold) the persisted index.

	    Returns:
	        The loaded or freshly built index.
	    """
	    file_path = f"./vectorStores/{index_name}"

	    if os.path.exists(file_path):
	        # rebuild storage context
	        storage_context = StorageContext.from_defaults(persist_dir=file_path)

	        # local load index access
	        # Pass the same service context that is used when building the
	        # index; without it the reloaded index falls back to the library's
	        # default service context instead of the module's custom LLM and
	        # prompt helper.
	        index = load_index_from_storage(
	            storage_context, service_context=service_context
	        )

	        # huggingface repo load access
	        # with fs.open(file_path, "r") as file:
	        # index = pickle.loads(file.readlines())
	        return index

	    documents = prepare_data(r"./assets/regItems.json")
	    index = GPTVectorStoreIndex.from_documents(
	        documents, service_context=service_context
	    )
	    # local write access
	    index.storage_context.persist(file_path)

	    # huggingface repo write access
	    # with fs.open(file_path, "w") as file:
	    # file.write(pickle.dumps(index))
	    return index