elia-waefler committed on
Commit
98e89b3
·
1 Parent(s): c432fc9
DokumententypenkatalogKBOB.xlsx ADDED
Binary file (63.8 kB). View file
 
requirements.txt CHANGED
@@ -2,3 +2,4 @@ streamlit
2
  pandas
3
  sentence-transformers
4
  unstructured
 
 
2
  pandas
3
  sentence-transformers
4
  unstructured
5
+ openpyxl
setup_vecstore.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Build the JSON vector store for the KBOB document type catalogue.

Reads the "Dokumententypen" sheet of ``DokumententypenkatalogKBOB.xlsx``,
embeds every document type with the 'all-MiniLM-L6-v2' sentence-transformer
model and writes the results as JSON files into the ``vectorstore`` directory:

* ``main_categories.json``            - rows whose DTC code has 1 character
* ``<letter>_categories.json``        - rows whose DTC code has 3 characters,
                                        grouped by the first character
* ``<3-char-code>_subcategories.json`` - rows whose DTC code has 6 characters,
                                        grouped by the first three characters
"""
import json
import os

import pandas as pd
from sentence_transformers import SentenceTransformer

# Directory that receives all generated JSON files.
VECTORSTORE_DIR = 'vectorstore'

# DTC code lengths that map to a known hierarchy level.
VALID_CODE_LENGTHS = (1, 3, 6)


# Load the Sentence Transformer model
def load_sentence_transformer():
    """Return a SentenceTransformer instance for 'all-MiniLM-L6-v2'."""
    return SentenceTransformer('all-MiniLM-L6-v2')


def _none_if_missing(value):
    """Return None for an empty Excel cell (pandas NaN/NA), else the value.

    json.dump would otherwise emit a bare ``NaN`` token, which is not valid
    JSON and is rejected by strict parsers.
    """
    return None if pd.isna(value) else value


def _write_json(filename, payload):
    """Serialize *payload* as indented JSON to VECTORSTORE_DIR/filename."""
    path = os.path.join(VECTORSTORE_DIR, filename)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=4)


sentence_transformer = load_sentence_transformer()

# Load the Excel file
df = pd.read_excel('DokumententypenkatalogKBOB.xlsx', sheet_name="Dokumententypen")


# Initialize dictionaries for storing vectorized categories
main_categories = {}
categories = {}
subcategories = {}

# Process each row in the Excel file
for index, row in df.iterrows():
    code = row['DTC']

    # Validate the code before doing any expensive work. An empty cell arrives
    # as a float NaN and a numeric cell as a number; calling len() on either
    # would raise TypeError and abort the whole run, so report and skip.
    if not isinstance(code, str) or len(code) not in VALID_CODE_LENGTHS:
        print(f"FALSCHE KATEGORIE IN {index}, {row}!!")
        continue

    type_ = _none_if_missing(row['TYPE'])
    disziplin = _none_if_missing(row['DISZIPLIN'])
    name = _none_if_missing(row['DOKUMENTTYP'])
    description = _none_if_missing(row['BESCHREIBUNG'])

    # Leave the description out of the embedding text when the cell is empty,
    # so the literal string "nan" does not end up in the embedded text.
    if description is None:
        embedding_text = f"Dokumententyp: {name}"
    else:
        embedding_text = f"Dokumententyp: {name}: {description}"
    mini_embedding = sentence_transformer.encode(embedding_text).tolist()

    vector_data = {
        "CODE": code,
        "Type": type_,
        "Disziplin": disziplin,
        "Name": name,
        "Beschreibung": description,
        "miniVec": mini_embedding,
        "openaiVec": "PLACEHOLDER"
    }

    if len(code) == 1:
        # Main category
        main_categories[code] = vector_data
    elif len(code) == 3:
        # Category, grouped under its main-category letter
        main_letter = code[0]
        categories.setdefault(main_letter, []).append(vector_data)
    else:
        # Subcategory (6 characters), grouped under its 3-character category
        category_code = code[:3]
        subcategories.setdefault(category_code, []).append(vector_data)

# The output directory is not guaranteed to exist (git does not track empty
# directories), so create it before writing.
os.makedirs(VECTORSTORE_DIR, exist_ok=True)

# Save main categories to a JSON file
_write_json('main_categories.json', list(main_categories.values()))

# Save categories to JSON files
for main_letter, vectors in categories.items():
    _write_json(f'{main_letter}_categories.json', vectors)

# Save subcategories to JSON files
for category_code, vectors in subcategories.items():
    _write_json(f'{category_code}_subcategories.json', vectors)

print("Vector store generation complete.")
vectorstore/Organisation.json DELETED
File without changes