# Spaces: elia-waefler / classify-KBOB (Sleeping)
# File size: 2,268 Bytes
# 98e89b3

import json
import os

import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the Sentence Transformer model
def load_sentence_transformer(model_name: str = 'all-MiniLM-L6-v2') -> "SentenceTransformer":
    """Instantiate and return a SentenceTransformer model.

    Args:
        model_name: Identifier handed to the ``SentenceTransformer``
            constructor. Defaults to ``'all-MiniLM-L6-v2'``, so calling the
            function without arguments behaves as before.

    Returns:
        The newly constructed ``SentenceTransformer`` instance.
    """
    return SentenceTransformer(model_name)

# Build the embedding model once at module level; the row loop reuses this
# single instance for every catalogue entry.
sentence_transformer = load_sentence_transformer()

# Read the "Dokumententypen" worksheet of the catalogue workbook into a
# DataFrame.
df = pd.read_excel(
    'DokumententypenkatalogKBOB.xlsx',
    sheet_name="Dokumententypen",
)


# Containers for the vectorized catalogue entries, bucketed by code length:
#   main_categories: 1-character code -> entry
#   categories:      first character of a 3-character code -> list of entries
#   subcategories:   first three characters of a 6-character code -> list of entries
main_categories = {}
categories = {}
subcategories = {}

# Process each row in the Excel file
for index, row in df.iterrows():
    code = row['DTC']

    # Validate the code before doing any expensive work. A cell that is empty
    # or purely numeric does not arrive as a str, and calling len() on it
    # would raise TypeError and abort the whole run; report such rows the same
    # way as codes of unexpected length and move on.
    if not isinstance(code, str) or len(code) not in (1, 3, 6):
        print(f"FALSCHE KATEGORIE IN {index}, {row}!!")
        continue

    type_ = row['TYPE']
    disziplin = row['DISZIPLIN']
    name = row['DOKUMENTTYP']
    description = row['BESCHREIBUNG']

    # NOTE(review): an empty DOKUMENTTYP/BESCHREIBUNG cell is presumably read
    # as NaN, which would be formatted as "nan" here and written as NaN by
    # json.dump -- confirm the catalogue has no empty cells in these columns.
    embedding_text = f"Dokumententyp: {name}: {description}"
    # .tolist() turns the encoder output into a plain list so it can be
    # serialized with json.dump.
    mini_embedding = sentence_transformer.encode(embedding_text).tolist()

    vector_data = {
        "CODE": code,
        "Type": type_,
        "Disziplin": disziplin,
        "Name": name,
        "Beschreibung": description,
        "miniVec": mini_embedding,
        "openaiVec": "PLACEHOLDER"
    }

    if len(code) == 1:
        # Main category: stored directly under its one-character code.
        main_categories[code] = vector_data
    elif len(code) == 3:
        # Category: grouped under the first character of its code.
        categories.setdefault(code[0], []).append(vector_data)
    else:
        # Subcategory (6-character code): grouped under its first three
        # characters.
        subcategories.setdefault(code[:3], []).append(vector_data)

# Write the vector store to disk as JSON files inside the 'vectorstore'
# directory.
# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists first; otherwise the run would fail with
# FileNotFoundError only after every embedding has already been computed.
os.makedirs('vectorstore', exist_ok=True)

# Save main categories to a JSON file (a list of entries, without the codes
# used as dictionary keys).
with open('vectorstore//main_categories.json', 'w', encoding='utf-8') as f:
    json.dump(list(main_categories.values()), f, indent=4)

# Save categories to JSON files, one file per main letter.
for main_letter, vectors in categories.items():
    with open(f'vectorstore//{main_letter}_categories.json', 'w', encoding='utf-8') as f:
        json.dump(vectors, f, indent=4)

# Save subcategories to JSON files, one file per three-character category code.
for category_code, vectors in subcategories.items():
    with open(f'vectorstore//{category_code}_subcategories.json', 'w', encoding='utf-8') as f:
        json.dump(vectors, f, indent=4)

print("Vector store generation complete.")