wicaksonolm2 commited on
Commit
4d037b0
·
1 Parent(s): 16879a1

[29.06.25] wicaksono-tmr | ✨ feat: add LangChain multimodal vectorization module

Browse files
Files changed (2) hide show
  1. src/RAG.py +0 -1
  2. src/vectorization.py +392 -0
src/RAG.py CHANGED
@@ -12,7 +12,6 @@ from langchain.vectorstores import Chroma
12
  from langchain.chat_models import ChatOpenAI
13
  from langchain.prompts import PromptTemplate
14
  from collections import defaultdict
15
-
16
  from vectorization import LangChainMultimodalVectorizer
17
  from year_parser import YearParser
18
  from config import *
 
12
  from langchain.chat_models import ChatOpenAI
13
  from langchain.prompts import PromptTemplate
14
  from collections import defaultdict
 
15
  from vectorization import LangChainMultimodalVectorizer
16
  from year_parser import YearParser
17
  from config import *
src/vectorization.py ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from typing import List, Dict, Any, Optional
4
+ from datetime import datetime
5
+ from dotenv import load_dotenv
6
+ from pathlib import Path
7
+
8
+ # LangChain imports
9
+ from langchain.embeddings import OpenAIEmbeddings
10
+ from langchain.vectorstores import Chroma
11
+ from langchain.schema import Document
12
+ load_dotenv()
13
+
14
+
15
class LangChainMultimodalVectorizer:
    """Vectorize multimodal extraction output (text chunks, images, tables,
    curriculum and silabus records) into per-year Chroma collections using
    OpenAI embeddings via LangChain.

    Each year gets its own collection (``optima_multimodal_<year>``) persisted
    under ``CHROMA_PERSIST_DIR/year_<year>``.
    """

    # Years searched when query_multimodal() is called without an explicit year.
    DEFAULT_YEARS = [2022, 2023, 2024]

    def __init__(self):
        # NOTE(review): assumes OPENAI_API_KEY is set in the environment
        # (load_dotenv() runs at module import time) — confirm in deployment.
        self.embeddings = OpenAIEmbeddings(
            openai_api_key=os.getenv("OPENAI_API_KEY"),
            model=os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002")
        )
        self.persist_dir = os.getenv("CHROMA_PERSIST_DIR", "./chroma_persist")

    def get_or_create_vectorstore(self, year: int) -> Chroma:
        """Get or create the Chroma vectorstore for a specific year.

        Chroma creates the collection lazily on first use, so a single
        constructor call covers both the "existing" and "new" cases (the
        previous version built the identical object again in its ``except``
        branch). The document count is read once and only used for logging.
        """
        collection_name = f"optima_multimodal_{year}"

        # Create per-year persist directory
        year_persist_dir = os.path.join(self.persist_dir, f"year_{year}")
        os.makedirs(year_persist_dir, exist_ok=True)

        vectorstore = Chroma(
            collection_name=collection_name,
            embedding_function=self.embeddings,
            persist_directory=year_persist_dir
        )

        try:
            # _collection is a private LangChain attribute; count once, reuse.
            doc_count = vectorstore._collection.count()
            if doc_count > 0:
                print(f"📚 Using existing vectorstore: {collection_name} ({doc_count} docs)")
            else:
                print(f"🆕 Created new vectorstore: {collection_name}")
        except Exception:
            # The count is informational only — never fail store creation on it.
            print(f"🆕 Creating new vectorstore: {collection_name}")

        return vectorstore

    def create_embedding_text(self, item: Dict[str, Any]) -> str:
        """Build the text that gets embedded, enriched per content_type.

        Prepends type-specific context (course, program, semester, table
        shape, …) so semantically similar queries land on the right modality.
        """
        content_type = item.get("content_type", "")
        content = item.get("content", "")
        context_text = item.get("context_text", "")

        if content_type == "silabus":
            mata_kuliah = item.get("mata_kuliah", "")
            course_code = item.get("course_code", "")
            silabus_type = item.get("silabus_type", "")
            program = item.get("program", "")
            semester = item.get("semester", "")
            embedding_text = f"Silabus {program} semester {semester} {mata_kuliah} {course_code} {silabus_type}: {content} {context_text}"

        elif content_type == "curriculum":
            program = item.get("program", "")
            semester = item.get("semester", "")
            table_type = item.get("table_type", "")
            embedding_text = f"Kurikulum {program} semester {semester} {table_type}: {content} {context_text}"

        elif content_type == "image":
            title = item.get("title", "")
            caption = item.get("caption", "")
            embedding_text = f"Gambar: {title} {caption} {content} {context_text}"

        elif content_type == "table":
            title = item.get("title", "")
            caption = item.get("caption", "")
            rows = item.get("rows", 0)
            cols = item.get("cols", 0)
            embedding_text = f"Tabel {rows}x{cols}: {title} {caption} {content} {context_text}"

        else:  # text_chunk and any unknown type
            chapter = item.get("chapter", "")
            section = item.get("section", "")
            embedding_text = f"Teks {chapter} {section}: {content} {context_text}"

        return embedding_text

    def prepare_document_metadata(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Build the metadata dict for a LangChain Document.

        Long free-text fields are truncated (200/300 chars) because Chroma
        metadata is meant for filtering, not full content storage.
        """
        content_type = item.get("content_type", "")

        # Base metadata common to every content type
        metadata = {
            "id": item.get("id", ""),
            "content_type": content_type,
            "year": item.get("year", 0),
            "page": item.get("page", 0),
            "filename": item.get("filename", "")[:200],
            "filepath": item.get("filepath", "")[:300],
            "extracted_at": item.get("extracted_at", "")
        }

        # Type-specific fields
        if content_type == "silabus":
            metadata.update({
                "mata_kuliah": item.get("mata_kuliah", "")[:200],
                "course_code": item.get("course_code", ""),
                "sks": item.get("sks", ""),
                "program": item.get("program", ""),
                "semester": item.get("semester", ""),
                "silabus_type": item.get("silabus_type", "")
            })
        elif content_type == "curriculum":
            metadata.update({
                "program": item.get("program", ""),
                "semester": item.get("semester", ""),
                "table_type": item.get("table_type", ""),
                "content_type_detail": item.get("content_type_detail", ""),
                "rows_count": item.get("rows_count", 0)
            })
        elif content_type == "image":
            metadata.update({
                "title": item.get("title", "")[:200],
                "caption": item.get("caption", "")[:300],
                "image_index": item.get("image_index", 0),
                "image_path": item.get("filepath", "")
            })
        elif content_type == "table":
            metadata.update({
                "title": item.get("title", "")[:200],
                "caption": item.get("caption", "")[:300],
                "table_index": item.get("table_index", 0),
                "rows": item.get("rows", 0),
                "cols": item.get("cols", 0),
                "table_path": item.get("filepath", "")
            })
        else:  # text_chunk
            metadata.update({
                "chapter": item.get("chapter", "")[:200],
                "section": item.get("section", "")[:200],
                "subsection": item.get("subsection", "")[:200],
                "chunk_type": item.get("chunk_type", ""),
                "quality_score": item.get("quality_score", 0.0)
            })

        return metadata

    def _extract_items(self, raw_data) -> Optional[List[Dict[str, Any]]]:
        """Normalize loaded JSON into a list of items; None if unsupported shape."""
        if isinstance(raw_data, dict):
            if 'content' in raw_data:
                print(f"📦 Detected structured JSON with 'content' key")
                return raw_data['content']
            print(f"❌ Unexpected JSON structure: {list(raw_data.keys())}")
            return None
        if isinstance(raw_data, list):
            print(f"📦 Detected direct array JSON")
            return raw_data
        print(f"❌ Unexpected JSON type: {type(raw_data)}")
        return None

    def process_unified_json(self, json_file_path: str, year: int) -> Dict[str, int]:
        """Index a unified multimodal JSON file into the year's vectorstore.

        Returns a stats dict keyed by content_type plus ``total``, ``errors``
        and ``skipped``; empty dict on unreadable input.
        """
        if not os.path.exists(json_file_path):
            print(f"❌ File not found: {json_file_path}")
            return {}

        print(f"🔄 Processing: {json_file_path}")

        with open(json_file_path, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)

        data = self._extract_items(raw_data)
        if data is None:
            return {}

        vectorstore = self.get_or_create_vectorstore(year)

        stats = {
            "text_chunk": 0,
            "image": 0,
            "table": 0,
            "curriculum": 0,
            "silabus": 0,
            "total": 0,
            "errors": 0,
            "skipped": 0
        }

        print(f"📊 Found {len(data)} items for year {year}")

        # Documents are embedded in batches to limit API calls / memory.
        documents = []
        batch_size = 50

        for idx, item in enumerate(data):
            try:
                if not isinstance(item, dict):
                    print(f"⚠️ Skipping non-dict item at index {idx}: {type(item)}")
                    stats["skipped"] += 1
                    continue

                content_type = item.get("content_type", "unknown")
                content = item.get("content", "")
                context_text = item.get("context_text", "")

                # Skip items with no meaningful content
                if not content and not context_text:
                    stats["skipped"] += 1
                    continue
                if len(str(content).strip()) < 3 and len(str(context_text).strip()) < 10:
                    stats["skipped"] += 1
                    continue

                embedding_text = self.create_embedding_text(item)
                metadata = self.prepare_document_metadata(item)

                documents.append(Document(
                    page_content=embedding_text,
                    metadata=metadata
                ))

                if content_type in stats:
                    stats[content_type] += 1
                else:
                    stats["unknown"] = stats.get("unknown", 0) + 1
                stats["total"] += 1

                # Flush a full batch. NOTE(review): add_documents_to_vectorstore
                # swallows failures, so stats count prepared (not stored) docs.
                if len(documents) >= batch_size:
                    self.add_documents_to_vectorstore(vectorstore, documents)
                    print(f"  ✅ Processed batch {stats['total']//batch_size} ({stats['total']} items)")
                    documents = []

            except Exception as e:
                print(f"❌ Error processing item {idx}: {e}")
                print(f"   Item type: {type(item)}")
                if isinstance(item, dict):
                    print(f"   Item keys: {list(item.keys())[:5]}...")
                else:
                    print(f"   Item content preview: {str(item)[:100]}...")
                stats["errors"] += 1

        # Flush the final partial batch
        if documents:
            self.add_documents_to_vectorstore(vectorstore, documents)

        vectorstore.persist()

        print(f"📊 Processing complete for year {year}:")
        for key, value in stats.items():
            if value > 0:
                print(f"   📝 {key}: {value}")

        return stats

    def add_documents_to_vectorstore(self, vectorstore: Chroma, documents: List[Document]):
        """Add documents to the vectorstore, logging (not raising) on failure."""
        try:
            vectorstore.add_documents(documents)
        except Exception as e:
            print(f"❌ Error adding documents to vectorstore: {e}")

    def query_multimodal(self, query_text: str, year: Optional[int] = None,
                         content_types: Optional[List[str]] = None,
                         n_results: int = 10) -> List[Dict]:
        """Similarity-search one year (or all DEFAULT_YEARS) and merge results.

        ``year`` is compared with ``is not None`` so a falsy-but-valid year
        would not silently trigger the all-years path. Scores are distances:
        lower is better, so results are sorted ascending.
        """
        results = []
        years_to_search = [year] if year is not None else self.DEFAULT_YEARS

        # Optional metadata filter restricting the content types returned.
        search_filter = {"content_type": {"$in": content_types}} if content_types else None

        for search_year in years_to_search:
            try:
                vectorstore = self.get_or_create_vectorstore(search_year)

                docs = vectorstore.similarity_search_with_score(
                    query_text,
                    k=n_results,
                    filter=search_filter
                )

                for doc, score in docs:
                    result = {
                        "content": doc.page_content,
                        "metadata": doc.metadata,
                        "score": score,
                        "year": search_year
                    }

                    # Attach on-disk asset paths for media results, with an
                    # existence flag so callers can skip missing files.
                    if result["metadata"]["content_type"] == "image":
                        result["image_path"] = result["metadata"].get("image_path", "")
                        result["retrievable"] = os.path.exists(result["image_path"]) if result["image_path"] else False
                    elif result["metadata"]["content_type"] == "table":
                        result["table_path"] = result["metadata"].get("table_path", "")
                        result["retrievable"] = os.path.exists(result["table_path"]) if result["table_path"] else False

                    results.append(result)

            except Exception as e:
                print(f"❌ Error querying year {search_year}: {e}")

        # Lower distance = better match
        results.sort(key=lambda x: x["score"])
        return results[:n_results]

    def get_vectorstore_stats(self, year: int) -> Dict:
        """Return document count and collection name for a year's store."""
        try:
            vectorstore = self.get_or_create_vectorstore(year)
            count = vectorstore._collection.count()
            return {
                "year": year,
                "total_documents": count,
                "collection_name": f"optima_multimodal_{year}"
            }
        except Exception as e:
            print(f"❌ Error getting stats for year {year}: {e}")
            return {"year": year, "total_documents": 0, "error": str(e)}
354
+
355
+
356
def process_all_unified_files(data_dir: str = "./chunked", years: Optional[List[int]] = None):
    """Vectorize every per-year unified JSON file found in *data_dir*.

    Looks for ``multimodal_unified_<year>.json`` for each year (defaults to
    2022–2024; pass ``years`` to override), indexes each into its year's
    vectorstore, and prints per-year and overall statistics.
    """
    vectorizer = LangChainMultimodalVectorizer()
    # Backward-compatible: previous version hard-coded this list.
    years = years if years is not None else [2022, 2023, 2024]
    total_stats = {"total": 0, "errors": 0}

    for year in years:
        json_file = os.path.join(data_dir, f"multimodal_unified_{year}.json")

        if not os.path.exists(json_file):
            print(f"⚠️ File not found: {json_file}")
            continue

        print(f"\n🔄 Processing year {year}...")

        stats = vectorizer.process_unified_json(json_file, year)

        if stats:
            print(f"📊 Year {year} Final Statistics:")
            for content_type, count in stats.items():
                print(f"   📝 {content_type}: {count}")

            total_stats["total"] += stats.get("total", 0)
            total_stats["errors"] += stats.get("errors", 0)

    print(f"\n🎉 FINAL PROCESSING SUMMARY:")
    print(f"   🎯 Total documents processed: {total_stats['total']}")
    print(f"   ❌ Total errors: {total_stats['errors']}")

    # Per-collection document counts
    print(f"\n📚 VECTORSTORE STATISTICS:")
    for year in years:
        stats = vectorizer.get_vectorstore_stats(year)
        # .get guards against a stats dict missing the key on error paths
        print(f"   {year}: {stats.get('total_documents', 0)} documents")
389
+
390
+
391
# Script entry point: vectorize all per-year unified JSON files with defaults.
if __name__ == "__main__":
    process_all_unified_files()