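"""Ingestion and retrieval utilities: read the configured CSV files, split and embed
their rows, persist them to a Chroma collection, and expose helpers to build or
reload the corresponding LlamaIndex vector store index and estimate chat token usage."""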
import os
import csv
import hashlib

import chromadb
import tiktoken
from llama_index.core import Settings, Document, VectorStoreIndex
from llama_index.core.schema import BaseNode
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.ingestion import IngestionPipeline
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore

from config import CHROMA_PATH, CHROMA_COLLECTION, FILES, CHUNK_SIZE, CHUNK_OVERLAP


api_key = os.getenv("OPENAI_API_KEY")

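# Global LlamaIndex defaults: the chat model used for generation and the
# embedding model used when indexing and querying.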
Settings.llm = OpenAI(temperature=0, model="gpt-4o-mini", api_key=api_key)
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-small"
)

def deterministic_id_func(i: int, doc: BaseNode) -> str:
    """Deterministic ID function for the text splitter.
    This will be used to generate a unique repeatable identifier for each node."""
    unique_identifier = doc.id_ + str(i)
    hasher = hashlib.sha256()
    hasher.update(unique_identifier.encode('utf-8')) 
    return hasher.hexdigest()

def create_db(return_nodes=False):
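    """Build the Chroma-backed index from the configured CSV files.

    Reads every file in FILES, wraps each row in a Document, splits the documents
    into token chunks, embeds them, and persists the result to the Chroma
    collection. Returns the resulting nodes if return_nodes is True, otherwise a
    VectorStoreIndex over the populated collection.
    """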
    rows = []
    # Read each CSV file and collect its data rows, skipping the header.
    for FILE in FILES:
        with open(FILE, mode="r", encoding="utf-8") as file:
            csv_reader = csv.reader(file)

            for idx, row in enumerate(csv_reader):
                if idx == 0: continue # Skip header row
                rows.append(row)

    # Convert the rows to Document objects so the LlamaIndex framework can process them.
    documents = [Document(text=row[1], metadata={"title": row[0], "url": row[2]}) for row in rows]
    # By default, node/chunk IDs are random UUIDs. To keep IDs stable across runs, set them manually.
    for idx, doc in enumerate(documents):
        doc.id_ = f"doc_{idx}"

    # Define the splitter that breaks each document into segments of CHUNK_SIZE tokens,
    # with CHUNK_OVERLAP tokens of overlap between consecutive segments.
    text_splitter = TokenTextSplitter(
        separator=" ", chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP,
        id_func=deterministic_id_func
    )

    chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
    chroma_collection = chroma_client.get_or_create_collection(CHROMA_COLLECTION)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    # Create the pipeline to apply the transformation (splitting and embedding) on each chunk,
    # and store the transformed text in the chroma vector store.
    pipeline = IngestionPipeline(
        transformations=[
            text_splitter,
            OpenAIEmbedding(model="text-embedding-3-small"),
        ],
        vector_store=vector_store
    )

    # Run the transformation pipeline.
    nodes = pipeline.run(documents=documents, show_progress=True)

    # Build a LlamaIndex index on top of the now-populated Chroma vector store.
    index = VectorStoreIndex.from_vector_store(vector_store)
    if return_nodes:
        return nodes
    else:
        return index


def load_db():
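    """Open the existing Chroma collection and return a VectorStoreIndex over it."""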
    chroma_client = chromadb.PersistentClient(CHROMA_PATH)
    chroma_collection = chroma_client.get_collection(CHROMA_COLLECTION)

    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        show_progress=True,
        use_async=True,
        embed_model=Settings.embed_model
    )

    return index

def load_asset(file):
    """Load a text asset (e.g. a CSS file) from disk; return None if it does not exist."""
    if os.path.exists(file):
        with open(file, "r", encoding="utf-8") as f:
            return f.read()
    return None

def num_tokens_from_messages(messages, model="gpt-4"):
    """Return the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    
    tokens_per_message = 3
    tokens_per_name = 1
    
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    
    return num_tokens
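

if __name__ == "__main__":
    # Illustrative sketch only (assumes CHROMA_PATH is a local directory and that
    # OPENAI_API_KEY is set); not part of the module's public surface.
    # Reuse an existing collection when one has already been persisted.
    index = load_db() if os.path.exists(CHROMA_PATH) else create_db()

    # The messages below are hypothetical and only illustrate the dict shape
    # expected by num_tokens_from_messages (OpenAI chat-style role/content pairs).
    example_messages = [
        {"role": "system", "content": "You answer questions about the indexed articles."},
        {"role": "user", "content": "Which topics does the knowledge base cover?"},
    ]
    print("Estimated prompt tokens:", num_tokens_from_messages(example_messages))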