Spaces:
Sleeping
Sleeping
import json
import os

import pandas as pd
from sentence_transformers import SentenceTransformer
# Load the Sentence Transformer model
def load_sentence_transformer(model_name: str = 'all-MiniLM-L6-v2') -> "SentenceTransformer":
    """Create the sentence-embedding model used to vectorize catalogue rows.

    Args:
        model_name: Model identifier handed to ``SentenceTransformer``.
            Defaults to ``'all-MiniLM-L6-v2'``, the model this script has
            always used, so existing zero-argument callers are unaffected.

    Returns:
        A new ``SentenceTransformer`` instance for ``model_name``.
    """
    return SentenceTransformer(model_name)
# Build the embedding model once; the row loop reuses this single instance.
sentence_transformer = load_sentence_transformer()

# Read the "Dokumententypen" worksheet of the catalogue workbook.
df = pd.read_excel(
    'DokumententypenkatalogKBOB.xlsx',
    sheet_name="Dokumententypen",
)

# Accumulators for the vectorized entries, one per hierarchy level:
#   main_categories - entry dict per 1-character code
#   categories      - list of entries per leading letter (3-character codes)
#   subcategories   - list of entries per 3-character prefix (6-character codes)
main_categories, categories, subcategories = {}, {}, {}
# Process each row in the Excel file
def _cell_or_none(value):
    """Return None for a missing spreadsheet cell (NaN/NaT/None), else the value unchanged."""
    return None if pd.isna(value) else value


for index, row in df.iterrows():
    raw_code = row['DTC']
    # A blank or numeric DTC cell is not a string; calling len() on it would
    # raise TypeError and abort the whole run. Treat it as an invalid code so
    # it is reported through the normal warning path instead. Stray
    # whitespace is stripped because the code is later used in file names.
    code = raw_code.strip() if isinstance(raw_code, str) else ''
    code_length = len(code)

    # The hierarchy level is encoded in the code length (1, 3 or 6 chars).
    # Reject anything else before paying for an embedding that would be
    # thrown away.
    if code_length not in (1, 3, 6):
        print(f"FALSCHE KATEGORIE IN {index}, {row}!!")
        continue

    type_ = _cell_or_none(row['TYPE'])
    disziplin = _cell_or_none(row['DISZIPLIN'])
    name = _cell_or_none(row['DOKUMENTTYP'])
    description = _cell_or_none(row['BESCHREIBUNG'])

    # Missing cells contribute an empty string to the embedding text rather
    # than the literal "nan"; populated cells produce the exact same text as
    # before, so existing embeddings stay comparable.
    name_text = '' if name is None else name
    description_text = '' if description is None else description
    embedding_text = f"Dokumententyp: {name_text}: {description_text}"
    mini_embedding = sentence_transformer.encode(embedding_text).tolist()

    # Missing cells are stored as None so they serialize to JSON null; a
    # float NaN would be written as the non-standard token NaN.
    vector_data = {
        "CODE": code,
        "Type": type_,
        "Disziplin": disziplin,
        "Name": name,
        "Beschreibung": description,
        "miniVec": mini_embedding,
        "openaiVec": "PLACEHOLDER",
    }

    if code_length == 1:
        # Main category: stored directly under its one-character code.
        main_categories[code] = vector_data
    elif code_length == 3:
        # Category: grouped under the leading letter of its code.
        categories.setdefault(code[0], []).append(vector_data)
    else:
        # Subcategory: grouped under the first three characters of its code.
        subcategories.setdefault(code[:3], []).append(vector_data)
# Save the collected vectors as JSON files inside the vectorstore directory
def _write_json(path, payload):
    """Write *payload* to *path* as 4-space-indented JSON, encoded as UTF-8."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=4)


output_dir = 'vectorstore'
# Create the target directory up front: open(..., 'w') does not create parent
# directories, so a missing folder would otherwise fail the run only after
# every embedding has already been computed.
os.makedirs(output_dir, exist_ok=True)

# Save main categories to a JSON file
_write_json(
    os.path.join(output_dir, 'main_categories.json'),
    list(main_categories.values()),
)

# Save categories to JSON files (one file per main-category letter)
for main_letter, vectors in categories.items():
    _write_json(os.path.join(output_dir, f'{main_letter}_categories.json'), vectors)

# Save subcategories to JSON files (one file per 3-character category code)
for category_code, vectors in subcategories.items():
    _write_json(os.path.join(output_dir, f'{category_code}_subcategories.json'), vectors)

print("Vector store generation complete.")