Spaces:

eoinmcg
/

AI_project

Build error

AI_project / utils.py

Eoin McGrath

ensures api_key can be loaded

91dc605 9 months ago

4.52 kB

	import os
	import csv
	import hashlib

	import chromadb
	from llama_index.core import Settings
	from llama_index.core import Document
	from llama_index.core.schema import BaseNode
	from llama_index.core.node_parser import TokenTextSplitter
	from llama_index.embeddings.openai import OpenAIEmbedding
	from llama_index.core.ingestion import IngestionPipeline
	from llama_index.core import VectorStoreIndex
	from llama_index.vector_stores.chroma import ChromaVectorStore
	from llama_index.embeddings.openai import OpenAIEmbedding
	from llama_index.llms.openai import OpenAI

	from config import CHROMA_PATH, CHROMA_COLLECTION, FILES, CHUNK_SIZE, CHUNK_OVERLAP


	api_key = os.getenv("OPENAI_API_KEY")

	Settings.llm = OpenAI(temperature=0, model="gpt-4o-mini", api_key=api_key)
	Settings.embed_model = OpenAIEmbedding(
	model="text-embedding-3-small"
	)

	def deterministic_id_func(i: int, doc: BaseNode) -> str:
	"""Deterministic ID function for the text splitter.
	This will be used to generate a unique repeatable identifier for each node."""
	unique_identifier = doc.id_ + str(i)
	hasher = hashlib.sha256()
	hasher.update(unique_identifier.encode('utf-8'))
	return hasher.hexdigest()

	def create_db(return_nodes=False):
	rows = []
	# Load the file as a JSON
	for FILE in FILES:
	with open(FILE, mode="r", encoding="utf-8") as file:
	csv_reader = csv.reader(file)

	for idx, row in enumerate(csv_reader):
	if idx == 0: continue # Skip header row
	rows.append(row)

	# Convert the chunks to Document objects so the LlamaIndex framework can process them.
	documents = [Document(text=row[1], metadata={"title": row[0], "url": row[2]}) for row in rows]
	# By default, the node/chunks ids are set to random uuids. To ensure same id's per run, we manually set them.
	for idx, doc in enumerate(documents):
	doc.id_ = f"doc_{idx}"

	# Define the splitter object that split the text into segments with 512 tokens,
	# with a 128 overlap between the segments.
	text_splitter = TokenTextSplitter(
	separator=" ", chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP,
	id_func=deterministic_id_func
	)

	chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
	chroma_collection = chroma_client.get_or_create_collection(CHROMA_COLLECTION)
	vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
	# Create the pipeline to apply the transformation (splitting and embedding) on each chunk,
	# and store the transformed text in the chroma vector store.
	pipeline = IngestionPipeline(
	transformations=[
	text_splitter,
	OpenAIEmbedding(model = 'text-embedding-3-small'),
	],
	vector_store=vector_store
	)

	# Run the transformation pipeline.
	nodes = pipeline.run(documents=documents, show_progress=True)

	db = chromadb.PersistentClient(path=CHROMA_PATH)
	chroma_collection = db.get_or_create_collection(CHROMA_COLLECTION)
	vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

	index = VectorStoreIndex.from_vector_store(vector_store)
	if return_nodes:
	return nodes
	else:
	return index


	def load_db():
	chroma_client = chromadb.PersistentClient(CHROMA_PATH)
	chroma_collection = chroma_client.get_collection(CHROMA_COLLECTION)

	vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

	index = VectorStoreIndex.from_vector_store(
	vector_store=vector_store,
	show_progress=True,
	use_async=True,
	embed_model=Settings.embed_model
	)

	return index

	def load_asset(file):
	"""Load CSS from an external file"""
	if os.path.exists(file):
	with open(file, "r", encoding="utf-8") as f:
	return f.read()

	def num_tokens_from_messages(messages, model="gpt-4"):
	"""Return the number of tokens used by a list of messages."""
	try:
	encoding = tiktoken.encoding_for_model(model)
	except KeyError:
	encoding = tiktoken.get_encoding("cl100k_base")

	tokens_per_message = 3
	tokens_per_name = 1

	num_tokens = 0
	for message in messages:
	num_tokens += tokens_per_message
	for key, value in message.items():
	num_tokens += len(encoding.encode(value))
	if key == "name":
	num_tokens += tokens_per_name
	num_tokens += 3 # every reply is primed with <\|start\|>assistant<\|message\|>

	return num_tokens