Spaces:
Sleeping
Sleeping
Commit ·
98e89b3
1
Parent(s): c432fc9
init
Browse files- DokumententypenkatalogKBOB.xlsx +0 -0
- requirements.txt +1 -0
- setup_vecstore.py +72 -0
- vectorstore/Organisation.json +0 -0
DokumententypenkatalogKBOB.xlsx
ADDED
|
Binary file (63.8 kB). View file
|
|
|
requirements.txt
CHANGED
|
@@ -2,3 +2,4 @@ streamlit
|
|
| 2 |
pandas
|
| 3 |
sentence-transformers
|
| 4 |
unstructured
|
|
|
|
|
|
| 2 |
pandas
|
| 3 |
sentence-transformers
|
| 4 |
unstructured
|
| 5 |
+
openpyxl
|
setup_vecstore.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import json
|
| 3 |
+
from sentence_transformers import SentenceTransformer
|
# Load the Sentence Transformer model
def load_sentence_transformer():
    """Return the 'all-MiniLM-L6-v2' SentenceTransformer used for the miniVec embeddings."""
    return SentenceTransformer('all-MiniLM-L6-v2')


def _clean_cell(value):
    """Return None for a missing spreadsheet cell, any other value unchanged.

    Missing cells show up as NaN/None; json.dump would write NaN as the
    non-standard token ``NaN``, whereas None serialises as a valid ``null``.
    """
    return None if pd.isna(value) else value


def _as_text(value):
    """Return the text to embed for a cell: '' when the cell is missing."""
    return "" if value is None else value


def build_vector_store(df, encode):
    """Embed every catalogue row and group the records by the length of their code.

    Args:
        df: DataFrame providing the columns DTC, TYPE, DISZIPLIN, DOKUMENTTYP
            and BESCHREIBUNG.
        encode: Callable turning one text into its embedding. The result may be
            an array-like with ``tolist()`` (as returned by
            ``SentenceTransformer.encode``) or any plain iterable of numbers.

    Returns:
        A tuple ``(main_categories, categories, subcategories)``:
        - main_categories: code (1 character) -> record
        - categories: first character of a 3-character code -> list of records
        - subcategories: first 3 characters of a 6-character code -> list of records

    Rows whose code is missing or has any other length are reported on stdout
    and skipped; they are not embedded.
    """
    main_categories = {}
    categories = {}
    subcategories = {}

    # Process each row in the Excel file
    for index, row in df.iterrows():
        raw_code = _clean_cell(row['DTC'])
        # A blank DTC cell is a float NaN, so len() on it would raise TypeError.
        # Normalise to a stripped string before looking at the length.
        code = "" if raw_code is None else str(raw_code).strip()
        if len(code) not in (1, 3, 6):
            print(f"FALSCHE KATEGORIE IN {index}, {row}!!")
            continue

        type_ = _clean_cell(row['TYPE'])
        disziplin = _clean_cell(row['DISZIPLIN'])
        name = _clean_cell(row['DOKUMENTTYP'])
        description = _clean_cell(row['BESCHREIBUNG'])

        # Missing cells contribute an empty string instead of the text "nan".
        embedding_text = f"Dokumententyp: {_as_text(name)}: {_as_text(description)}"
        embedding = encode(embedding_text)
        if hasattr(embedding, "tolist"):
            mini_embedding = embedding.tolist()
        else:
            mini_embedding = list(embedding)

        vector_data = {
            "CODE": code,
            "Type": type_,
            "Disziplin": disziplin,
            "Name": name,
            "Beschreibung": description,
            "miniVec": mini_embedding,
            "openaiVec": "PLACEHOLDER",
        }

        if len(code) == 1:
            # Main category
            main_categories[code] = vector_data
        elif len(code) == 3:
            # Category, grouped under the letter of its main category
            categories.setdefault(code[0], []).append(vector_data)
        else:
            # Subcategory (6 characters), grouped under its 3-character category code
            subcategories.setdefault(code[:3], []).append(vector_data)

    return main_categories, categories, subcategories


def save_vector_store(main_categories, categories, subcategories, out_dir='vectorstore'):
    """Write the grouped records as JSON files into ``out_dir``.

    Creates ``out_dir`` if it does not exist, then writes:
    - ``main_categories.json``: list of all main-category records
    - ``<letter>_categories.json``: one file per main-category letter
    - ``<code>_subcategories.json``: one file per 3-character category code
    """
    # Local import so this block does not depend on edits to the file's import section.
    from pathlib import Path

    target = Path(out_dir)
    # open(..., 'w') does not create missing directories.
    target.mkdir(parents=True, exist_ok=True)

    def dump(payload, filename):
        # json.dump escapes non-ASCII by default, so the files stay pure ASCII.
        with open(target / filename, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=4)

    # Save main categories to a JSON file
    dump(list(main_categories.values()), 'main_categories.json')

    # Save categories to JSON files
    for main_letter, vectors in categories.items():
        dump(vectors, f'{main_letter}_categories.json')

    # Save subcategories to JSON files
    for category_code, vectors in subcategories.items():
        dump(vectors, f'{category_code}_subcategories.json')


def main():
    """Build the vector store from the KBOB document-type catalogue workbook."""
    sentence_transformer = load_sentence_transformer()

    # Load the Excel file
    df = pd.read_excel('DokumententypenkatalogKBOB.xlsx', sheet_name="Dokumententypen")

    main_categories, categories, subcategories = build_vector_store(
        df, sentence_transformer.encode
    )
    save_vector_store(main_categories, categories, subcategories)

    print("Vector store generation complete.")


if __name__ == "__main__":
    main()
vectorstore/Organisation.json
DELETED
|
File without changes
|