Spaces:

zhtet
/

RegBotBeta

Sleeping

App Files Community

Zwea Htet committed on Jun 16, 2023

Commit

9a2650e

1 Parent(s): 38bc9e2

added file system for huggingface and object serialization

Browse files

Files changed (3) hide show

.gitignore +2 -1
models/bloom.py +45 -16
requirements.txt +3 -1

.gitignore CHANGED Viewed

@@ -3,4 +3,5 @@ data/__pycache__
 models/__pycache__
 .env
 __pycache__
-vectorStores

 models/__pycache__
 .env
 __pycache__
+vectorStores
+.vscode

models/bloom.py CHANGED Viewed

@@ -1,22 +1,31 @@
 import os
 from json import dumps, loads
 import numpy as np
 import openai
 import pandas as pd
 from dotenv import load_dotenv
-from llama_index import (Document, GPTVectorStoreIndex, LLMPredictor,
-                         PromptHelper, ServiceContext, StorageContext,
-                         load_index_from_storage)
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from utils.customLLM import CustomLLM
 load_dotenv()
 openai.api_key = os.getenv("OPENAI_API_KEY")
 # get model
-# model_name = "bigscience/bloom-560m"
 # tokenizer = AutoTokenizer.from_pretrained(model_name)
 # model = AutoModelForCausalLM.from_pretrained(model_name, config='T5Config')
@@ -44,35 +53,55 @@ prompt_helper = PromptHelper(context_window, num_output, chunk_overlap_ratio)
 # define llm
 llm_predictor = LLMPredictor(llm=CustomLLM())
-service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)
-def prepare_data(file_path:str):
     df = pd.read_json(file_path)
-    df = df.replace(to_replace="", value=np.nan).dropna(axis=0) # remove null values
     parsed = loads(df.to_json(orient="records"))
     documents = []
     for item in parsed:
-        document = Document(item['paragraphText'],
-                            item['_id']['$oid'],
-                            extra_info={"chapter": item['chapter'],
-                                        "article": item['article'],
-                                        "title": item['title']})
         documents.append(document)
     return documents
 def initialize_index(index_name):
     file_path = f"./vectorStores/{index_name}"
     if os.path.exists(file_path):
         # rebuild storage context
         storage_context = StorageContext.from_defaults(persist_dir=file_path)
-        # load index
         index = load_index_from_storage(storage_context)
         return index
     else:
         documents = prepare_data(r"./assets/regItems.json")
-        index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)
         index.storage_context.persist(file_path)
-        return index

 import os
+import pickle
 from json import dumps, loads
 import numpy as np
 import openai
 import pandas as pd
 from dotenv import load_dotenv
+from huggingface_hub import HfFileSystem
+from llama_index import (
+    Document,
+    GPTVectorStoreIndex,
+    LLMPredictor,
+    PromptHelper,
+    ServiceContext,
+    StorageContext,
+    load_index_from_storage,
+)
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from utils.customLLM import CustomLLM
 load_dotenv()
 openai.api_key = os.getenv("OPENAI_API_KEY")
+fs = HfFileSystem()
 # get model
+# model_name = "bigscience/bloom-560m"
 # tokenizer = AutoTokenizer.from_pretrained(model_name)
 # model = AutoModelForCausalLM.from_pretrained(model_name, config='T5Config')
 # define llm
 llm_predictor = LLMPredictor(llm=CustomLLM())
+service_context = ServiceContext.from_defaults(
+    llm_predictor=llm_predictor, prompt_helper=prompt_helper
+)
+def prepare_data(file_path: str):
     df = pd.read_json(file_path)
+    df = df.replace(to_replace="", value=np.nan).dropna(axis=0)  # remove null values
     parsed = loads(df.to_json(orient="records"))
     documents = []
     for item in parsed:
+        document = Document(
+            item["paragraphText"],
+            item["_id"]["$oid"],
+            extra_info={
+                "chapter": item["chapter"],
+                "article": item["article"],
+                "title": item["title"],
+            },
+        )
         documents.append(document)
     return documents
 def initialize_index(index_name):
     file_path = f"./vectorStores/{index_name}"
     if os.path.exists(file_path):
         # rebuild storage context
         storage_context = StorageContext.from_defaults(persist_dir=file_path)
+        # local load index access
         index = load_index_from_storage(storage_context)
+        # huggingface repo load access
+        with fs.open(file_path, "r") as file:
+            index = pickle.loads(file.readlines())
         return index
     else:
         documents = prepare_data(r"./assets/regItems.json")
+        index = GPTVectorStoreIndex.from_documents(
+            documents, service_context=service_context
+        )
+        # local write access
         index.storage_context.persist(file_path)
+        # huggingface repo write access
+        with fs.open(file_path, "w") as file:
+            file.write(pickle.dumps(index))
+        return index

requirements.txt CHANGED Viewed

@@ -8,4 +8,6 @@ openai
 faiss-cpu
 python-dotenv
 streamlit
-streamlit-chat

 faiss-cpu
 python-dotenv
 streamlit
+streamlit-chat
+huggingface_hub
+pickle5