AI_project / utils.py
Eoin McGrath
ensures api_key can be loaded
91dc605
import os
import csv
import hashlib
import chromadb
from llama_index.core import Settings
from llama_index.core import Document
from llama_index.core.schema import BaseNode
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from config import CHROMA_PATH, CHROMA_COLLECTION, FILES, CHUNK_SIZE, CHUNK_OVERLAP
# OpenAI credential taken from the environment; None when the variable is unset.
api_key = os.environ.get("OPENAI_API_KEY")

# Register the models on LlamaIndex's global Settings object: a chat model
# running at temperature 0 and the embedding model used for vector search.
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0, api_key=api_key)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
def deterministic_id_func(i: int, doc: BaseNode) -> str:
    """Return a repeatable, unambiguous identifier for chunk ``i`` of ``doc``.

    The identifier is the SHA-256 hex digest of ``"<doc.id_>:<i>"``. The
    explicit separator matters: with plain concatenation both
    (``"doc_1"``, 11) and (``"doc_11"``, 1) collapse to ``"doc_111"``, so two
    different chunks would receive the same ID.

    NOTE: the resulting IDs differ from those produced by separator-less
    concatenation, so a vector store populated with the old IDs should be
    rebuilt rather than appended to.

    Args:
        i: Integer index supplied by the text splitter (presumably the chunk's
            position within ``doc``).
        doc: Source node; only its ``id_`` attribute is read.

    Returns:
        A 64-character lowercase hexadecimal digest.
    """
    # str(i) of an int never contains ":", so the last ":" always marks the
    # boundary between the document ID and the index.
    unique_identifier = f"{doc.id_}:{i}"
    return hashlib.sha256(unique_identifier.encode("utf-8")).hexdigest()
def _read_csv_rows(paths):
    """Collect the data rows (header excluded) of every CSV file in ``paths``."""
    rows = []
    for path in paths:
        # newline="" lets the csv module handle line endings itself, which it
        # needs in order to parse quoted fields containing embedded newlines.
        with open(path, mode="r", encoding="utf-8", newline="") as handle:
            reader = csv.reader(handle)
            next(reader, None)  # Skip the header row (no-op for an empty file).
            # csv.reader yields [] for blank lines; drop them so they do not
            # fail later when the columns are indexed.
            rows.extend(row for row in reader if row)
    return rows


def create_db(return_nodes=False):
    """Ingest the configured CSV files into the Chroma collection.

    Every file listed in ``FILES`` is read as CSV; its first row is treated as
    a header and skipped, and blank lines are ignored. Each remaining row is
    expected to hold the title in column 0, the text in column 1 and the URL
    in column 2. The rows become LlamaIndex documents, which are split into
    token chunks, embedded, and written to the persistent Chroma collection
    ``CHROMA_COLLECTION`` stored under ``CHROMA_PATH``.

    Args:
        return_nodes: When true, return the nodes produced by the ingestion
            pipeline instead of an index.

    Returns:
        The list of ingested nodes if ``return_nodes`` is true; otherwise a
        ``VectorStoreIndex`` backed by the populated vector store.

    Raises:
        IndexError: If a non-blank data row has fewer than three columns.
    """
    rows = _read_csv_rows(FILES)

    # Convert the rows to Document objects so the LlamaIndex framework can process them.
    documents = [
        Document(text=row[1], metadata={"title": row[0], "url": row[2]})
        for row in rows
    ]

    # Document ids default to random uuids; assign positional ids instead so
    # that the ids (and the node ids derived from them) repeat across runs.
    for idx, doc in enumerate(documents):
        doc.id_ = f"doc_{idx}"

    # Split the text into segments of CHUNK_SIZE tokens with CHUNK_OVERLAP
    # tokens of overlap between consecutive segments.
    text_splitter = TokenTextSplitter(
        separator=" ",
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        id_func=deterministic_id_func,
    )

    chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
    chroma_collection = chroma_client.get_or_create_collection(CHROMA_COLLECTION)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

    # Split and embed each document, storing the resulting nodes in the Chroma
    # vector store. The shared Settings.embed_model is used so that ingestion
    # and later queries embed with the same model.
    pipeline = IngestionPipeline(
        transformations=[
            text_splitter,
            Settings.embed_model,
        ],
        vector_store=vector_store,
    )

    # Run the transformation pipeline.
    nodes = pipeline.run(documents=documents, show_progress=True)

    if return_nodes:
        return nodes
    # The vector store populated above is reused directly; there is no need to
    # reopen the client and collection a second time.
    return VectorStoreIndex.from_vector_store(vector_store)
def load_db():
    """Open the stored Chroma collection and return a VectorStoreIndex over it.

    The collection ``CHROMA_COLLECTION`` is fetched with ``get_collection``
    (not ``get_or_create_collection``) from the persistent client rooted at
    ``CHROMA_PATH``, so it is expected to have been created beforehand. The
    index is built with the embedding model registered on ``Settings``.

    Returns:
        A ``VectorStoreIndex`` backed by the existing Chroma collection.
    """
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = client.get_collection(CHROMA_COLLECTION)
    store = ChromaVectorStore(chroma_collection=collection)
    return VectorStoreIndex.from_vector_store(
        vector_store=store,
        show_progress=True,
        use_async=True,
        embed_model=Settings.embed_model,
    )
def load_asset(file):
    """Return the UTF-8 text content of ``file``, or ``None`` if the path does not exist."""
    if not os.path.exists(file):
        return None
    with open(file, "r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
def num_tokens_from_messages(messages, model="gpt-4"):
    """Return the number of tokens used by a list of messages.

    Args:
        messages: Iterable of message dicts; every value in each dict is
            tokenized, so values are expected to be strings.
        model: Model name used to select the tokenizer. If tiktoken does not
            recognise the name, the ``cl100k_base`` encoding is used instead.

    Returns:
        The total token count: a fixed overhead of 3 per message, the encoded
        length of every value, 1 extra for each ``"name"`` key, plus 3 for the
        primed assistant reply.
    """
    # Imported here because the module's import section does not bring in
    # tiktoken; without this, the first call fails with NameError.
    import tiktoken

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")

    tokens_per_message = 3
    tokens_per_name = 1
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens