# classify-KBOB / setup_vecstore.py
# Hosting-page header (not code): elia-waefler -- commit "init" -- 98e89b3
import json
from pathlib import Path

import pandas as pd
from sentence_transformers import SentenceTransformer
# Load the Sentence Transformer model
def load_sentence_transformer(model_name='all-MiniLM-L6-v2'):
    """Instantiate the sentence-embedding model.

    Args:
        model_name: Model identifier handed straight to
            ``SentenceTransformer``. Defaults to ``'all-MiniLM-L6-v2'``,
            the model this script was originally hard-wired to, so calling
            the function without arguments behaves exactly as before.

    Returns:
        The ``SentenceTransformer`` instance built from ``model_name``.
    """
    return SentenceTransformer(model_name)
# One shared model instance; it is reused for every row that gets encoded.
sentence_transformer = load_sentence_transformer()

# Read the "Dokumententypen" sheet of the catalogue workbook into a DataFrame.
df = pd.read_excel(
    'DokumententypenkatalogKBOB.xlsx',
    sheet_name="Dokumententypen",
)

# Accumulators for the vectorized entries, one per hierarchy level:
#   main_categories: code          -> entry
#   categories:      main letter   -> list of entries
#   subcategories:   category code -> list of entries
main_categories, categories, subcategories = {}, {}, {}
def _clean_cell(value):
    """Return None for an empty spreadsheet cell (NaN/NaT/None), else the value unchanged."""
    return None if pd.isna(value) else value


# Process each row in the Excel file
for index, row in df.iterrows():
    code = row['DTC']

    # The hierarchy level is taken from the code length: 1 character is a
    # main category, 3 a category, 6 a subcategory. Anything else is reported
    # and skipped *before* the embedding is computed. The isinstance check
    # also covers blank cells, which pandas delivers as float NaN and on
    # which len() would raise TypeError and abort the whole run.
    if not isinstance(code, str) or len(code) not in (1, 3, 6):
        print(f"FALSCHE KATEGORIE IN {index}, {row}!!")
        continue

    # Blank cells become None so they are written as JSON null instead of
    # the non-standard NaN token that json.dump emits for float NaN.
    type_ = _clean_cell(row['TYPE'])
    disziplin = _clean_cell(row['DISZIPLIN'])
    name = _clean_cell(row['DOKUMENTTYP'])
    description = _clean_cell(row['BESCHREIBUNG'])

    # Missing text contributes an empty string to the embedding input rather
    # than the literal word "nan".
    name_text = "" if name is None else name
    description_text = "" if description is None else description
    embedding_text = f"Dokumententyp: {name_text}: {description_text}"
    mini_embedding = sentence_transformer.encode(embedding_text).tolist()

    vector_data = {
        "CODE": code,
        "Type": type_,
        "Disziplin": disziplin,
        "Name": name,
        "Beschreibung": description,
        "miniVec": mini_embedding,
        "openaiVec": "PLACEHOLDER",
    }

    if len(code) == 1:
        # Main category: stored directly under its own code.
        main_categories[code] = vector_data
    elif len(code) == 3:
        # Category: grouped under the first character of its code.
        main_letter = code[0]
        categories.setdefault(main_letter, []).append(vector_data)
    else:
        # Subcategory (length 6): grouped under its first three characters.
        category_code = code[:3]
        subcategories.setdefault(category_code, []).append(vector_data)
# Create the output directory up front: open(..., 'w') does not create
# missing parent directories, so a fresh checkout would otherwise fail with
# FileNotFoundError on the first write.
_VECTORSTORE_DIR = Path('vectorstore')
_VECTORSTORE_DIR.mkdir(parents=True, exist_ok=True)


def _write_vector_file(filename, vectors):
    """Write ``vectors`` as 4-space-indented JSON to ``vectorstore/<filename>``.

    Args:
        filename: File name (without directory) inside the vector store.
        vectors: JSON-serializable payload, here a list of entry dicts.
    """
    # Explicit encoding keeps the output independent of the platform default.
    with open(_VECTORSTORE_DIR / filename, 'w', encoding='utf-8') as f:
        json.dump(vectors, f, indent=4)


# Save main categories to a JSON file
_write_vector_file('main_categories.json', list(main_categories.values()))

# Save categories to JSON files, one file per main-category letter
for main_letter, vectors in categories.items():
    _write_vector_file(f'{main_letter}_categories.json', vectors)

# Save subcategories to JSON files, one file per three-character category code
for category_code, vectors in subcategories.items():
    _write_vector_file(f'{category_code}_subcategories.json', vectors)

print("Vector store generation complete.")