"""Generate the JSON vector store for the KBOB document type catalogue.

Reads the sheet "Dokumententypen" of ``DokumententypenkatalogKBOB.xlsx``,
embeds every row with the ``all-MiniLM-L6-v2`` sentence transformer and
writes the results into the ``vectorstore`` directory:

* ``main_categories.json``          - rows whose code has 1 character
* ``<letter>_categories.json``      - rows whose code has 3 characters,
  grouped by the first character of the code
* ``<code>_subcategories.json``     - rows whose code has 6 characters,
  grouped by the first 3 characters of the code
"""
import json
from collections import defaultdict
from pathlib import Path

import pandas as pd
from sentence_transformers import SentenceTransformer

# Directory that receives all generated JSON files.
OUTPUT_DIR = Path("vectorstore")


# Load the Sentence Transformer model
def load_sentence_transformer():
    """Return a ``SentenceTransformer`` for the 'all-MiniLM-L6-v2' model."""
    return SentenceTransformer('all-MiniLM-L6-v2')


def _none_if_missing(value):
    """Return ``None`` for a missing (NaN/None) cell, else the value itself.

    ``json.dump`` would otherwise write a float NaN as the bare token
    ``NaN``, which is not valid JSON for strict parsers.
    """
    return None if pd.isna(value) else value


def _write_json(path, payload):
    """Serialize *payload* to *path* as JSON indented by 4 spaces."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=4)


sentence_transformer = load_sentence_transformer()

# Load the Excel file
df = pd.read_excel('DokumententypenkatalogKBOB.xlsx', sheet_name="Dokumententypen")

# Initialize dictionaries for storing vectorized categories
main_categories = {}
categories = defaultdict(list)      # first character of code -> entries
subcategories = defaultdict(list)   # first 3 characters of code -> entries

# Process each row in the Excel file
for index, row in df.iterrows():
    code = row['DTC']

    # An empty cell arrives as float NaN and a numeric cell as a number;
    # neither supports len(), so report them like any other malformed code
    # instead of crashing. Checking first also avoids embedding rows that
    # are going to be rejected anyway.
    if not isinstance(code, str) or len(code) not in (1, 3, 6):
        print(f"FALSCHE KATEGORIE IN {index}, {row}!!")
        continue

    name = row['DOKUMENTTYP']
    description = row['BESCHREIBUNG']

    embedding_text = f"Dokumententyp: {name}: {description}"
    mini_embedding = sentence_transformer.encode(embedding_text).tolist()

    vector_data = {
        "CODE": code,
        "Type": _none_if_missing(row['TYPE']),
        "Disziplin": _none_if_missing(row['DISZIPLIN']),
        "Name": _none_if_missing(name),
        "Beschreibung": _none_if_missing(description),
        "miniVec": mini_embedding,
        "openaiVec": "PLACEHOLDER"
    }

    if len(code) == 1:  # Main category
        main_categories[code] = vector_data
    elif len(code) == 3:  # Category
        categories[code[0]].append(vector_data)
    else:  # Subcategory (6 characters)
        subcategories[code[:3]].append(vector_data)

# The output directory may not exist yet; open(..., 'w') does not create it.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Save main categories to a JSON file
_write_json(OUTPUT_DIR / 'main_categories.json', list(main_categories.values()))

# Save categories to JSON files
for main_letter, vectors in categories.items():
    _write_json(OUTPUT_DIR / f'{main_letter}_categories.json', vectors)

# Save subcategories to JSON files
for category_code, vectors in subcategories.items():
    _write_json(OUTPUT_DIR / f'{category_code}_subcategories.json', vectors)

print("Vector store generation complete.")