Spaces:
Sleeping
Sleeping
| import os | |
| import pickle | |
| from json import dumps, loads | |
| import numpy as np | |
| import openai | |
| import pandas as pd | |
| from dotenv import load_dotenv | |
| from huggingface_hub import HfFileSystem | |
| from llama_index import ( | |
| Document, | |
| GPTVectorStoreIndex, | |
| LLMPredictor, | |
| PromptHelper, | |
| ServiceContext, | |
| StorageContext, | |
| load_index_from_storage, | |
| ) | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline | |
| from utils.customLLM import CustomLLM | |
# Load variables from a .env file into the environment, then hand the
# OpenAI key (None when the variable is unset) to the openai client.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Handle for the Hugging Face Hub file system.
fs = HfFileSystem()

# NOTE: a disabled experiment loaded "bigscience/bloom-560m" through
# AutoTokenizer / AutoModelForCausalLM and wrapped it in a transformers
# "text-generation" pipeline (do_sample=True, top_p=0.95, top_k=50,
# temperature=0.7). It is not active; CustomLLM below is used instead.

# Prompt helper settings.
context_window = 2048  # maximum input size
num_output = 525  # number of output tokens
chunk_overlap_ratio = 0.2  # maximum chunk overlap
prompt_helper = PromptHelper(context_window, num_output, chunk_overlap_ratio)

# LLM definition: the predictor wraps the project's CustomLLM, and the
# service context bundles it with the prompt helper for index construction.
llm_predictor = LLMPredictor(llm=CustomLLM())
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    prompt_helper=prompt_helper,
)
def prepare_data(file_path: str):
    """Read records from a JSON file and wrap each one in a ``Document``.

    Rows that contain an empty string or a null value in any column are
    dropped before conversion.

    Args:
        file_path: Path to a JSON file readable by ``pandas.read_json``.
            Every record must provide ``paragraphText``, ``_id`` (holding a
            nested ``$oid``), ``chapter``, ``article`` and ``title``.

    Returns:
        A list with one ``Document`` per remaining record. The document text
        is ``paragraphText``, its id is ``_id.$oid``, and ``chapter``,
        ``article`` and ``title`` are attached as extra info.
    """
    frame = pd.read_json(file_path)
    # Turn empty strings into NaN so a single dropna() removes both empty
    # and null cells.
    cleaned = frame.replace(to_replace="", value=np.nan).dropna(axis=0)
    # Round-trip through JSON to obtain one plain dict per row.
    records = loads(cleaned.to_json(orient="records"))
    return [
        Document(
            text=record["paragraphText"],
            doc_id=record["_id"]["$oid"],
            extra_info={
                "chapter": record["chapter"],
                "article": record["article"],
                "title": record["title"],
            },
        )
        for record in records
    ]
def initialize_index(index_name):
    """Return the vector index named *index_name*, building it if needed.

    When ``./vectorStores/<index_name>`` already exists, the index is
    reloaded from that directory. Otherwise it is built from
    ``./assets/regItems.json`` and persisted to that directory so later
    calls can reload it.

    Both branches attach the module-level ``service_context`` so that a
    reloaded index is configured the same way as a freshly built one.

    Args:
        index_name: Name of the sub-directory under ``./vectorStores`` that
            holds (or will hold) the persisted index.

    Returns:
        The loaded or newly built index.
    """
    file_path = f"./vectorStores/{index_name}"

    if os.path.exists(file_path):
        # Rebuild the storage context from the persisted directory.
        storage_context = StorageContext.from_defaults(persist_dir=file_path)
        # Pass service_context explicitly: without it the reloaded index
        # falls back to the library defaults rather than the CustomLLM /
        # PromptHelper configuration it was built with.
        return load_index_from_storage(
            storage_context, service_context=service_context
        )

    documents = prepare_data(r"./assets/regItems.json")
    index = GPTVectorStoreIndex.from_documents(
        documents, service_context=service_context
    )
    # Persist locally so the next call can reload instead of rebuilding.
    # NOTE: storing the index on the Hugging Face Hub through `fs` was
    # sketched with pickle but is not enabled; pickle must not be used to
    # load data from an untrusted repository.
    index.storage_context.persist(file_path)
    return index