Spaces:

elia-waefler
/

classify-KBOB

Sleeping

App Files Files Community

elia-waefler committed on May 28, 2024

Commit

98e89b3

1 Parent(s): c432fc9

init

Browse files

Files changed (4) hide show

DokumententypenkatalogKBOB.xlsx +0 -0
requirements.txt +1 -0
setup_vecstore.py +72 -0
vectorstore/Organisation.json +0 -0

DokumententypenkatalogKBOB.xlsx ADDED Viewed

Binary file (63.8 kB). View file

requirements.txt CHANGED Viewed

@@ -2,3 +2,4 @@ streamlit
 pandas
 sentence-transformers
 unstructured

 pandas
 sentence-transformers
 unstructured
+openpyxl

setup_vecstore.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import json
+from pathlib import Path
+
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+# Load the Sentence Transformer model
+def load_sentence_transformer():
+    return SentenceTransformer('all-MiniLM-L6-v2')
+sentence_transformer = load_sentence_transformer()
+# Load the Excel file
+df = pd.read_excel('DokumententypenkatalogKBOB.xlsx', sheet_name="Dokumententypen")
+# Initialize dictionaries for storing vectorized categories
+main_categories = {}
+categories = {}
+subcategories = {}
+# Process each row in the Excel file
+for index, row in df.iterrows():
+    code = row['DTC']
+    type_ = row['TYPE']
+    disziplin = row['DISZIPLIN']
+    name = row['DOKUMENTTYP']
+    description = row['BESCHREIBUNG']
+    embedding_text = f"Dokumententyp: {name}: {description}"
+    mini_embedding = sentence_transformer.encode(embedding_text).tolist()
+    vector_data = {
+        "CODE": code,
+        "Type": type_,
+        "Disziplin": disziplin,
+        "Name": name,
+        "Beschreibung": description,
+        "miniVec": mini_embedding,
+        "openaiVec": "PLACEHOLDER"
+    }
+    if len(code) == 1:
+        # Main category
+        main_categories[code] = vector_data
+    elif len(code) == 3:
+        # Category
+        main_letter = code[0]
+        if main_letter not in categories:
+            categories[main_letter] = []
+        categories[main_letter].append(vector_data)
+    elif len(code) == 6:
+        # Subcategory
+        category_code = code[:3]
+        if category_code not in subcategories:
+            subcategories[category_code] = []
+        subcategories[category_code].append(vector_data)
+    else:
+        print(f"FALSCHE KATEGORIE IN {index}, {row}!!")
+# Save main categories to a JSON file
+with open('vectorstore//main_categories.json', 'w') as f:
+    json.dump(list(main_categories.values()), f, indent=4)
+# Save categories to JSON files
+for main_letter, vectors in categories.items():
+    with open(f'vectorstore//{main_letter}_categories.json', 'w') as f:
+        json.dump(vectors, f, indent=4)
+# Save subcategories to JSON files
+for category_code, vectors in subcategories.items():
+    with open(f'vectorstore//{category_code}_subcategories.json', 'w') as f:
+        json.dump(vectors, f, indent=4)
+print("Vector store generation complete.")

vectorstore/Organisation.json DELETED Viewed

File without changes