import os
import pickle  # used only by the commented-out Hugging Face Hub persistence paths below
from json import loads

import numpy as np
import openai
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import HfFileSystem
from llama_index import (
    Document,
    GPTVectorStoreIndex,
    LLMPredictor,
    PromptHelper,
    ServiceContext,
    StorageContext,
    load_index_from_storage,
)
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline  # for the commented-out local pipeline below

from utils.customLLM import CustomLLM

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
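# assumes a .env file alongside this script, e.g. containing:
# OPENAI_API_KEY=<your key>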
fs = HfFileSystem()  # Hugging Face Hub filesystem, used only by the commented-out remote load/persist paths

# get model
# model_name = "bigscience/bloom-560m"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)

# define prompt helper
# set maximum input size
context_window = 2048
# set number of output tokens
num_output = 525
# set chunk overlap as a ratio of the available chunk size
chunk_overlap_ratio = 0.2
prompt_helper = PromptHelper(context_window, num_output, chunk_overlap_ratio)
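# Rough token budget implied by these settings (approximation, not llama_index's
# exact accounting): about context_window - num_output = 2048 - 525 = 1523 tokens
# remain for the prompt template and retrieved chunks, with ~20% chunk overlap.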

# create a pipeline
# pl = pipeline(
#     model=model,
#     tokenizer=tokenizer,
#     task="text-generation",
#     # device=0, # GPU device number
#     # max_length=512,
#     do_sample=True,
#     top_p=0.95,
#     top_k=50,
#     temperature=0.7
# )

# define llm: wrap the custom model so llama_index can call it
llm_predictor = LLMPredictor(llm=CustomLLM())
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor, prompt_helper=prompt_helper
)


def prepare_data(file_path: str):
    """Read a JSON export into a DataFrame and wrap each record in a llama_index Document."""
    df = pd.read_json(file_path)
    df = df.replace(to_replace="", value=np.nan).dropna(axis=0)  # drop rows with empty or missing fields

    parsed = loads(df.to_json(orient="records"))
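    # Each record is expected to look like (inferred from the fields used below):
    # {"_id": {"$oid": "..."}, "chapter": "...", "article": "...",
    #  "title": "...", "paragraphText": "..."}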

    documents = []
    for item in parsed:
        document = Document(
            text=item["paragraphText"],
            doc_id=item["_id"]["$oid"],
            extra_info={
                "chapter": item["chapter"],
                "article": item["article"],
                "title": item["title"],
            },
        )
        documents.append(document)

    return documents


def initialize_index(index_name: str):
    """Load a persisted index from ./vectorStores/<index_name>, or build and persist one."""
    file_path = f"./vectorStores/{index_name}"
    if os.path.exists(file_path):
        # rebuild storage context from the persisted directory
        storage_context = StorageContext.from_defaults(persist_dir=file_path)

        # load the index locally; pass the custom service context so queries
        # keep using CustomLLM instead of the default OpenAI predictor
        index = load_index_from_storage(storage_context, service_context=service_context)

        # huggingface repo load access
        # with fs.open(file_path, "rb") as file:
        #     index = pickle.loads(file.read())
        return index
    else:
        documents = prepare_data("./assets/regItems.json")
        index = GPTVectorStoreIndex.from_documents(
            documents, service_context=service_context
        )
        # local write access
        index.storage_context.persist(file_path)

        # huggingface repo write access
        # with fs.open(file_path, "wb") as file:
        #     file.write(pickle.dumps(index))
        return index
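

# Minimal usage sketch (assumption: not part of the original module; the index
# name and question are hypothetical placeholders).
if __name__ == "__main__":
    index = initialize_index("regulations")
    # as_query_engine() is the llama_index 0.6-era query API
    query_engine = index.as_query_engine()
    print(query_engine.query("Which articles does chapter 1 contain?"))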