wicaksonolm2 committed on
Commit
1b060e0
Β·
1 Parent(s): bdc2b18

Untrack .py and .md from LFS and restore as regular files

Files changed (6)
  1. README.md +17 -3
  2. src/RAG.py +883 -3
  3. src/README.md +5 -3
  4. src/config.py +72 -3
  5. src/streamlit_app.py +521 -3
  6. src/year_parser.py +60 -3
README.md CHANGED
@@ -1,3 +1,17 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:970b32745a7011cfcc27e7f6909ce9e74f53bc721fe542032922ac286fe3ebe9
- size 227
+ ---
+ title: Optima
+ emoji: "πŸš€"
+ colorFrom: red
+ colorTo: red
+ sdk: docker
+ app_port: 8501
+ tags:
+   - streamlit
+ pinned: false
+ app_file: src/streamlit_app.py
+ short_description: Streamlit template space
+ ---
+
+ # Welcome to Optima
+
+ test
src/RAG.py CHANGED
@@ -1,3 +1,883 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:56756246b3d6938f172e35a01b2674525cef40dfba6e48fa33c4ceb96316c4a6
- size 38059
+ import os
+ import json
+ import pandas as pd
+ from typing import List, Dict, Any, Optional, Tuple, Set
+ from datetime import datetime
+ from dotenv import load_dotenv
+ import numpy as np
+ from sklearn.metrics.pairwise import cosine_similarity
+ import re
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.vectorstores import Chroma
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts import PromptTemplate
+ from collections import defaultdict
+
+ from vectorization import LangChainMultimodalVectorizer
+ from year_parser import YearParser
+ from config import *
+ load_dotenv()
+
+
+ class EnhancedMultimodalRAGSystem:
+     def __init__(self):
+         """Initialize enhanced RAG system with multimodal capabilities"""
+         self.vectorizer = LangChainMultimodalVectorizer()
+         self.llm = ChatOpenAI(
+             openai_api_key=os.getenv("OPENAI_API_KEY"),
+             model_name=os.getenv("OPENAI_MODEL", DEFAULT_LLM_MODEL),
+             temperature=LLM_TEMPERATURE,
+             max_tokens=MAX_TOKENS,
+             request_timeout=LLM_TIMEOUT
+         )
+         self.year_parser = YearParser()
+         self.COSINE_SIMILARITY_THRESHOLD = COSINE_SIMILARITY_THRESHOLD
+         self.MAX_SIMILAR_CONTEXT = MAX_SIMILAR_CONTEXT
+         self.VALID_YEARS = VALID_YEARS
+
+         # New: context expansion settings
+         self.CONTEXT_EXPANSION_ENABLED = True
+         self.MAX_CONTEXT_CHUNKS_PER_SOURCE = 5  # Max additional chunks per source
+         self.CONTEXT_SIMILARITY_THRESHOLD = 0.7  # Similarity threshold for context expansion
+
+         if VERBOSE_LOGGING:
+             print("πŸš€ Enhanced Multimodal RAG System initialized")
+             print(f" 🧠 LLM Model: {os.getenv('OPENAI_MODEL', DEFAULT_LLM_MODEL)}")
+             print(f" πŸ“Š Cosine Similarity Threshold: {self.COSINE_SIMILARITY_THRESHOLD}")
+             print(f" πŸ“… Valid Years: {self.VALID_YEARS}")
+             print(f" πŸ”— Context Expansion: {self.CONTEXT_EXPANSION_ENABLED}")
+
+     def get_metadata_similarity_score(self, meta1: Dict, meta2: Dict) -> float:
+         """Calculate similarity score between two metadata objects"""
+         similarity_score = 0.0
+         total_weight = 0.0
+
+         # Define weights for different metadata fields
+         field_weights = {
+             'year': 0.3,
+             'page': 0.2,
+             'program': 0.25,
+             'semester': 0.15,
+             'chapter': 0.2,
+             'section': 0.15,
+             'subsection': 0.1,
+             'content_type': 0.2,
+             'course_code': 0.15,
+             'mata_kuliah': 0.15
+         }
+         for field, weight in field_weights.items():
+             if field in meta1 and field in meta2:
+                 total_weight += weight
+                 if field in ['year', 'semester']:
+                     if meta1[field] == meta2[field]:
+                         similarity_score += weight
+                 elif field == 'page':
+                     # Nearby pages earn partial credit
+                     try:
+                         page1, page2 = int(meta1[field]), int(meta2[field])
+                         page_diff = abs(page1 - page2)
+                         if page_diff == 0:
+                             similarity_score += weight
+                         elif page_diff <= 2:
+                             similarity_score += weight * 0.5
+                     except (ValueError, TypeError):
+                         pass
+                 else:
+                     str1, str2 = str(meta1[field]).lower(), str(meta2[field]).lower()
+                     if str1 == str2:
+                         similarity_score += weight
+                     elif str1 in str2 or str2 in str1:
+                         similarity_score += weight * 0.7
+         return similarity_score / total_weight if total_weight > 0 else 0.0
+
+     def find_contextual_chunks(self, base_result: Dict, all_results: List[Dict]) -> List[Dict]:
+         base_metadata = base_result["metadata"]
+         contextual_chunks = []
+
+         for result in all_results:
+             if result["metadata"].get("id") == base_metadata.get("id"):
+                 continue
+             if result["metadata"].get("year") != base_metadata.get("year"):
+                 continue
+
+             similarity_score = self.get_metadata_similarity_score(base_metadata, result["metadata"])
+
+             if similarity_score >= self.CONTEXT_SIMILARITY_THRESHOLD:
+                 result["context_similarity_score"] = similarity_score
+                 contextual_chunks.append(result)
+
+         # Sort by similarity score and limit
+         contextual_chunks.sort(key=lambda x: x["context_similarity_score"], reverse=True)
+         return contextual_chunks[:self.MAX_CONTEXT_CHUNKS_PER_SOURCE]
+
+     def get_document_chunks_by_metadata(self, metadata: Dict, year: int) -> List[Dict]:
+         """Get all chunks from the same document/source with similar metadata"""
+         try:
+             # Build a more specific query based on metadata
+             search_filters = []
+
+             if metadata.get('program'):
+                 search_filters.append(f"program:{metadata['program']}")
+             if metadata.get('semester'):
+                 search_filters.append(f"semester:{metadata['semester']}")
+             if metadata.get('chapter'):
+                 search_filters.append(f"chapter:{metadata['chapter']}")
+             if metadata.get('section'):
+                 search_filters.append(f"section:{metadata['section']}")
+
+             # Create a search query from metadata
+             search_query = " ".join(search_filters) if search_filters else metadata.get('title', '')
+
+             # Get chunks from vectorstore with broader search
+             results = self.vectorizer.query_multimodal(
+                 query_text=search_query,
+                 year=year,
+                 content_types=None,
+                 n_results=20  # Get more results for context expansion
+             )
+
+             return results
+
+         except Exception as e:
+             print(f"❌ Error getting document chunks: {e}")
+             return []
+
+     def expand_context_for_results(self, initial_results: List[Dict]) -> List[Dict]:
+         """Expand context by finding related chunks for each initial result"""
+         if not self.CONTEXT_EXPANSION_ENABLED:
+             return initial_results
+
+         expanded_results = []
+         seen_ids = set()
+
+         for result in initial_results:
+             # Add the original result
+             result_id = result["metadata"].get("id", "")
+             if result_id not in seen_ids:
+                 result["is_primary_result"] = True
+                 expanded_results.append(result)
+                 seen_ids.add(result_id)
+
+             # Find contextual chunks
+             year = result.get("search_year", result["metadata"].get("year"))
+             if year:
+                 document_chunks = self.get_document_chunks_by_metadata(
+                     result["metadata"], year
+                 )
+
+                 contextual_chunks = self.find_contextual_chunks(result, document_chunks)
+
+                 # Add contextual chunks
+                 for ctx_chunk in contextual_chunks:
+                     ctx_id = ctx_chunk["metadata"].get("id", "")
+                     if ctx_id not in seen_ids:
+                         ctx_chunk["is_primary_result"] = False
+                         ctx_chunk["parent_result_id"] = result_id
+                         expanded_results.append(ctx_chunk)
+                         seen_ids.add(ctx_id)
+
+                         if VERBOSE_LOGGING:
+                             print(f"πŸ”— Added contextual chunk for {result_id}: {ctx_id}")
+
+         if VERBOSE_LOGGING:
+             primary_count = sum(1 for r in expanded_results if r.get("is_primary_result", False))
+             context_count = len(expanded_results) - primary_count
+             print(f"πŸ“ˆ Context expansion: {primary_count} primary + {context_count} contextual = {len(expanded_results)} total")
+
+         return expanded_results
+
+     def group_related_content(self, results: List[Dict]) -> Dict[str, List[Dict]]:
+         """Group results by their relationships (same document, similar metadata, etc.)"""
+         groups = defaultdict(list)
+
+         for result in results:
+             metadata = result["metadata"]
+
+             # Create grouping key based on metadata
+             group_key_parts = []
+
+             if metadata.get('program'):
+                 group_key_parts.append(f"prog_{metadata['program']}")
+             if metadata.get('year'):
+                 group_key_parts.append(f"year_{metadata['year']}")
+             if metadata.get('semester'):
+                 group_key_parts.append(f"sem_{metadata['semester']}")
+             if metadata.get('chapter'):
+                 group_key_parts.append(f"ch_{metadata['chapter']}")
+             if metadata.get('content_type'):
+                 group_key_parts.append(f"type_{metadata['content_type']}")
+
+             group_key = "_".join(group_key_parts) if group_key_parts else "general"
+             groups[group_key].append(result)
+
+         return dict(groups)
+
+     def retrieve_multimodal_context_enhanced(self, query_context: Dict[str, Any], k: int = 10) -> List[Dict]:
+         """Enhanced retrieval with context expansion"""
+         all_results = []
+         content_strategies = {}
+
+         for content_type, ratio in CONTENT_TYPE_STRATEGIES.items():
+             content_strategies[content_type] = max(1, int(k * ratio))
+
+         if LOG_RETRIEVAL_DETAILS:
+             print(f"🎯 Content strategies: {content_strategies}")
+             print(f"πŸ“… Searching years: {query_context['years']}")
+
+         # Step 1: Get initial results
+         for year in query_context["years"]:
+             if year not in self.VALID_YEARS:
+                 print(f"⚠️ Skipping invalid year: {year}")
+                 continue
+
+             try:
+                 if query_context.get("preferred_content_types"):
+                     for content_type in query_context["preferred_content_types"]:
+                         results = self.vectorizer.query_multimodal(
+                             query_text=query_context["cleaned_query"],
+                             year=year,
+                             content_types=[content_type],
+                             n_results=content_strategies.get(content_type, k // 4)
+                         )
+                         for result in results:
+                             result["search_year"] = year
+                             result["content_priority"] = True
+                         all_results.extend(results)
+
+                 remaining_k = max(1, k - len(all_results))
+                 general_results = self.vectorizer.query_multimodal(
+                     query_text=query_context["cleaned_query"],
+                     year=year,
+                     content_types=None,
+                     n_results=remaining_k
+                 )
+                 for result in general_results:
+                     result["search_year"] = year
+                     result["content_priority"] = False
+                 all_results.extend(general_results)
+
+             except Exception as e:
+                 print(f"❌ Error retrieving from year {year}: {e}")
+
+         # Step 2: Deduplicate and rank the initial hits
+         unique_results = self._deduplicate_and_rank_results(all_results, k)
+
+         # Step 3: Expand beyond the initial hits using metadata-based context
+         expanded_results = self.expand_context_for_results(unique_results)
+
+         # Step 4: Final ranking and limiting
+         final_results = self._final_ranking_with_context(expanded_results, k * 2)  # Allow more results due to context
+
+         if VERBOSE_LOGGING:
+             print(f"πŸ“š Final results with context: {len(final_results)}")
+
+         return final_results
+
+     def _final_ranking_with_context(self, results: List[Dict], max_results: int) -> List[Dict]:
+         """Final ranking that considers both primary results and their context"""
+         # Separate primary and contextual results
+         primary_results = [r for r in results if r.get("is_primary_result", True)]
+         contextual_results = [r for r in results if not r.get("is_primary_result", True)]
+
+         # Sort primary results by score
+         primary_results.sort(key=lambda x: x.get("score", 0), reverse=True)
+
+         # For each primary result, add its best contextual chunks
+         final_results = []
+         for primary in primary_results:
+             if len(final_results) >= max_results:
+                 break
+
+             final_results.append(primary)
+
+             # Add related contextual chunks
+             primary_id = primary["metadata"].get("id", "")
+             related_contexts = [
+                 r for r in contextual_results
+                 if r.get("parent_result_id") == primary_id
+             ]
+
+             # Sort contextual chunks by their similarity score
+             related_contexts.sort(key=lambda x: x.get("context_similarity_score", 0), reverse=True)
+
+             # Add top contextual chunks
+             for ctx in related_contexts[:2]:  # Limit to 2 contextual chunks per primary
+                 if len(final_results) < max_results:
+                     final_results.append(ctx)
+
+         return final_results
+
+     def format_enhanced_context_with_grouping(self, results: List[Dict]) -> str:
+         """Format context with grouping and relationship indicators"""
+         if not results:
+             return "Tidak ada informasi yang relevan ditemukan."
+
+         # Group related content
+         grouped_results = self.group_related_content(results)
+         context_parts = []
+
+         for group_key, group_results in grouped_results.items():
+             context_parts.append(f"\n{'='*60}")
+             context_parts.append(f"πŸ“‚ GRUP: {group_key.replace('_', ' ').upper()}")
+             context_parts.append(f"{'='*60}")
+
+             for i, result in enumerate(group_results, 1):
+                 content_type = result["metadata"]["content_type"]
+                 is_primary = result.get("is_primary_result", True)
+
+                 # Add indicator for primary vs contextual
+                 result_type = "🎯 PRIMARY" if is_primary else "πŸ”— CONTEXT"
+
+                 # Enhanced formatting based on content type
+                 if content_type == "table":
+                     context_part = self.enhance_table_context_with_markdown(result)
+                 elif content_type == "image":
+                     context_part = self.enhance_image_context_with_details(result)
+                 elif content_type == "silabus":
+                     context_part = self.enhance_silabus_context_detailed(result)
+                 elif content_type == "curriculum":
+                     context_part = self.enhance_curriculum_context_detailed(result)
+                 elif content_type == "text_chunk":
+                     context_part = self.enhance_text_context_detailed(result)
+                 else:
+                     context_part = f"""
+ **KONTEN {content_type.upper()}:**
+ - **Tahun:** {result["metadata"].get('year', 'N/A')}
+ - **Halaman:** {result["metadata"].get('page', 'N/A')}
+ - **Context:** {result.get('context_text', '')[:200]}...
+
+ **Konten:**
+ {result['content'][:500]}...
+ """
+
+                 header = f"**{result_type} SUMBER {i}:**"
+                 if not is_primary:
+                     similarity_score = result.get("context_similarity_score", 0)
+                     header += f" (Similarity: {similarity_score:.2f})"
+
+                 context_parts.append(f"{header}\n{context_part}")
+
+         return "\n\n".join(context_parts)
+
+     def _deduplicate_and_rank_results(self, all_results: List[Dict], k: int) -> List[Dict]:
+         seen_ids = set()
+         unique_results = []
+         sorted_results = sorted(
+             all_results,
+             key=lambda x: (x.get("score", 0), not x.get("content_priority", False))
+         )
+
+         content_type_counts = {}
+         max_per_type = max(1, k // len(CONTENT_TYPE_STRATEGIES))
+
+         for result in sorted_results:
+             result_id = result["metadata"].get("id", "")
+             content_type = result["metadata"]["content_type"]
+
+             # Skip duplicates
+             if result_id in seen_ids:
+                 continue
+
+             # Limit per content type for diversity (unless priority content)
+             if not result.get("content_priority", False):
+                 if content_type_counts.get(content_type, 0) >= max_per_type:
+                     continue
+
+             seen_ids.add(result_id)
+             content_type_counts[content_type] = content_type_counts.get(content_type, 0) + 1
+
+             # Enhance with context_text
+             if "context_text" not in result:
+                 result["context_text"] = result["metadata"].get("context_text", "")
+
+             unique_results.append(result)
+
+             if len(unique_results) >= k:
+                 break
+
+         return unique_results
+
+     def enhance_table_context_with_markdown(self, result: Dict) -> str:
+         """Enhanced table context with markdown formatting"""
+         metadata = result["metadata"]
+         context_text = result.get("context_text", "")
+
+         enhanced_context = f"""
+ **TABEL ENHANCED:**
+ - **Judul:** {metadata.get('title', 'N/A')}
+ - **Ukuran:** {metadata.get('rows', 0)} baris Γ— {metadata.get('cols', 0)} kolom
+ - **Tahun:** {metadata.get('year', 'N/A')}
+ - **Halaman:** {metadata.get('page', 'N/A')}
+ - **Context:** {context_text}
+ - **Preview:** {result['content'][:300]}...
+
+ **Konten Lengkap:**
+ {result['content']}
+ """
+         return enhanced_context
+
+     def enhance_image_context_with_details(self, result: Dict) -> str:
+         """Enhanced image context with detailed metadata"""
+         metadata = result["metadata"]
+         context_text = result.get("context_text", "")
+
+         enhanced_context = f"""
+ **GAMBAR ENHANCED:**
+ - **Judul:** {metadata.get('title', 'N/A')}
+ - **Caption:** {metadata.get('caption', 'N/A')}
+ - **Tahun:** {metadata.get('year', 'N/A')}
+ - **Halaman:** {metadata.get('page', 'N/A')}
+ - **Context:** {context_text}
+ - **Deskripsi:** {result['content'][:300]}...
+
+ **Path Gambar:** {metadata.get('image_path', 'N/A')}
+ """
+         return enhanced_context
+
+     def enhance_silabus_context_detailed(self, result: Dict) -> str:
+         """Enhanced silabus context with comprehensive details"""
+         metadata = result["metadata"]
+         context_text = result.get("context_text", "")
+
+         enhanced_context = f"""
+ **SILABUS ENHANCED:**
+ - **Mata Kuliah:** {metadata.get('mata_kuliah', 'N/A')} ({metadata.get('course_code', 'N/A')})
+ - **Program Studi:** {metadata.get('program', 'N/A').title()}
+ - **Semester:** {metadata.get('semester', 'N/A')}
+ - **SKS:** {metadata.get('sks', 'N/A')}
+ - **Tipe Silabus:** {metadata.get('silabus_type', 'N/A')}
+ - **Tahun Kurikulum:** {metadata.get('year', 'N/A')}
+ - **Halaman:** {metadata.get('page', 'N/A')}
+ - **Context Text:** {context_text}
+
+ **Konten Lengkap:**
+ {result['content']}
+ """
+         return enhanced_context
+
+     def enhance_curriculum_context_detailed(self, result: Dict) -> str:
+         """Enhanced curriculum context with comprehensive details"""
+         metadata = result["metadata"]
+         context_text = result.get("context_text", "")
+
+         enhanced_context = f"""
+ **KURIKULUM ENHANCED:**
+ - **Program Studi:** {metadata.get('program', 'N/A').title()}
+ - **Semester:** {metadata.get('semester', 'N/A')}
+ - **Jenis Tabel:** {metadata.get('table_type', 'N/A')}
+ - **Jumlah Mata Kuliah:** {metadata.get('rows_count', 'N/A')}
+ - **Tahun Kurikulum:** {metadata.get('year', 'N/A')}
+ - **Halaman:** {metadata.get('page', 'N/A')}
+ - **Context Text:** {context_text}
+
+ **Konten Lengkap:**
+ {result['content']}
+ """
+         return enhanced_context
+
+     def enhance_text_context_detailed(self, result: Dict) -> str:
+         """Enhanced text context with comprehensive details"""
+         metadata = result["metadata"]
+         context_text = result.get("context_text", "")
+
+         enhanced_context = f"""
+ **TEKS ENHANCED:**
+ - **Bab:** {metadata.get('chapter', 'N/A')}
+ - **Bagian:** {metadata.get('section', 'N/A')}
+ - **Sub-bagian:** {metadata.get('subsection', 'N/A')}
+ - **Tahun:** {metadata.get('year', 'N/A')}
+ - **Halaman:** {metadata.get('page', 'N/A')}
+ - **Context Text:** {context_text}
+
+ **Konten Lengkap:**
+ {result['content']}
+ """
+         return enhanced_context
+
+     def format_enhanced_context(self, results: List[Dict]) -> str:
+         """Format context with comprehensive enhancements and grouping"""
+         return self.format_enhanced_context_with_grouping(results)
+
+     def generate_response(self, query: str, context: str, chat_history: List[Dict] = None) -> str:
+         """Generate response using LLM with context and chat history"""
+
+         # Prepare chat history context
+         chat_history_text = ""
+         if chat_history and len(chat_history) > 1:
+             recent_messages = chat_history[-CONTEXT_WINDOW_SIZE:]
+             chat_history_text = "\n\nRiwayat Percakapan Terakhir:\n"
+             for msg in recent_messages[:-1]:  # Exclude current message
+                 role = "User" if msg["role"] == "user" else "Assistant"
+                 chat_history_text += f"{role}: {msg['content'][:200]}...\n"
+
+         # Enhanced prompt
+         enhanced_prompt = f"""
+ Anda adalah asisten akademik DTMI UGM yang membantu mahasiswa dan dosen.
+
+ {chat_history_text}
+
+ Pertanyaan Saat Ini: {query}
+
+ Konteks Informasi:
+ {context}
+
+ Instruksi:
+ 1. Berikan jawaban yang komprehensif dan akurat
+ 2. Gunakan informasi dari konteks yang relevan
+ 3. Jika merujuk ke tahun atau program studi, sebutkan secara spesifik
+ 4. Format jawaban dengan struktur yang jelas (gunakan bullet points, numbering jika perlu)
+ 5. Jika ada tabel atau data, jelaskan dengan detail
+ 6. Akhiri dengan saran atau informasi tambahan yang berguna
+ 7. Pertimbangkan konteks percakapan sebelumnya jika relevan
+ 8. Manfaatkan informasi kontekstual yang tersedia untuk memberikan jawaban yang lebih lengkap
+
+ Jawaban:
+ """
+         for attempt in range(MAX_RETRIES):
+             try:
+                 response = self.llm.predict(enhanced_prompt)
+                 return response
+             except Exception as e:
+                 if attempt == MAX_RETRIES - 1:
+                     return FALLBACK_RESPONSE
+                 else:
+                     import time
+                     time.sleep(RETRY_DELAY)
+
+         return FALLBACK_RESPONSE
+
+     def parse_query_context(self, query: str) -> Dict[str, Any]:
+         """Parse query context with year extraction and content type detection"""
+         years, cleaned_query, user_mentioned_year, user_mentioned_invalid_year = self.year_parser.extract_years(query)
+         comparison_keywords = ["bandingkan", "banding", "perbandingan",
+                                "dibanding", "vs", "versus", "perbedaan"]
+         year_comparison_mode = any(keyword in cleaned_query.lower()
+                                    for keyword in comparison_keywords) and len(years) > 1
+
+         content_type_hints = {
+             "silabus": ["silabus", "mata kuliah", "course", "sks", "pembelajaran", "materi"],
+             "curriculum": ["kurikulum", "curriculum", "semester", "program studi", "struktur"],
+             "table": ["tabel", "table", "data", "statistik", "daftar", "distribusi"],
+             "image": ["gambar", "image", "foto", "diagram", "struktur", "chart"],
+             "text_chunk": ["informasi", "penjelasan", "deskripsi", "detail", "tentang"]
+         }
+         preferred_types = []
+         query_lower = cleaned_query.lower()
+
+         for content_type, keywords in content_type_hints.items():
+             if any(keyword in query_lower for keyword in keywords):
+                 preferred_types.append(content_type)
+
+         return {
+             "original_query": query,
+             "cleaned_query": cleaned_query,
+             "years": years,
+             "preferred_content_types": preferred_types,
+             "year_comparison_mode": year_comparison_mode
+         }
+
+     def query(self, question: str, k: int = 10, content_filter: List[str] = None) -> Dict[str, Any]:
+         years, cleaned_query, user_mentioned_year, user_mentioned_invalid_year = self.year_parser.extract_years(
+             question)
+         if user_mentioned_invalid_year and not years:
+             return {
+                 "question": question,
+                 "answer": "Maaf, informasi mengenai kurikulum tahun yang Anda minta tidak tersedia dalam konteks database ini.",
+                 "context": "",
+                 "sources": [],
+                 "primary_sources": [],
+                 "contextual_sources": [],
+                 "years_searched": [],
+                 "content_types_used": [],
+                 "total_sources": 0,
+                 "primary_sources_count": 0,
+                 "contextual_sources_count": 0,
+                 "has_images": False,
+                 "has_tables": False,
+                 "image_data": [],
+                 "table_data": [],
+                 "image_paths": [],
+                 "table_paths": [],
+                 "year_comparison_mode": False,
+                 "context_expansion_enabled": self.CONTEXT_EXPANSION_ENABLED,
+                 "processing_time": datetime.now().isoformat()
+             }
+         if VERBOSE_LOGGING:
+             print(f"πŸ” Processing query: {question}")
+         query_context = self.parse_query_context(question)
+         if content_filter:
+             query_context["preferred_content_types"] = content_filter
+         if LOG_RETRIEVAL_DETAILS:
+             print(f"πŸ“… Years: {query_context['years']}")
+             print(f"🎯 Content types: {query_context['preferred_content_types']}")
+             print(f"πŸ” Content filter: {content_filter}")
+         results = self.retrieve_multimodal_context_enhanced(query_context, k)
+         context = self.format_enhanced_context(results)
+         try:
+             response = self.generate_response(question, context)
+         except Exception as e:
+             print(f"❌ Error generating answer: {e}")
+             response = FALLBACK_RESPONSE
+         image_data = []
+         table_data = []
+
+         for result in results:
+             metadata = result["metadata"]
+             content_type = metadata.get("content_type", "")
+
+             # βœ… FILTER: keep primary sources only
+             is_primary = result.get("is_primary_result", True)
+             if not is_primary:
+                 continue  # Skip contextual sources
+
+             # πŸ–ΌοΈ Extract image information (primary sources only)
+             if content_type == "image":
+                 original_image_path = metadata.get("image_path", "")
+                 if original_image_path:
+                     # Path fixing logic (same as before)
+                     fixed_path = original_image_path
+                     if fixed_path.startswith("./src/"):
+                         fixed_path = fixed_path.replace("./src/", "./")
+                     elif fixed_path.startswith("src/"):
+                         fixed_path = fixed_path.replace("src/", "./")
+
+                     if os.path.exists(fixed_path):
+                         image_path = fixed_path
+                     elif os.path.exists(original_image_path):
+                         image_path = original_image_path
+                     else:
+                         alternatives = [
+                             original_image_path.lstrip('./'),
+                             f"../{original_image_path.lstrip('./')}",
+                             original_image_path.replace("./src/", "../")
+                         ]
+                         image_path = None
+                         for alt in alternatives:
+                             if os.path.exists(alt):
+                                 image_path = alt
+                                 break
+
+                         if not image_path:
+                             image_path = original_image_path
+
+                     if VERBOSE_LOGGING:
+                         print("πŸ–ΌοΈ PRIMARY image path resolution:")
+                         print(f" Original: {original_image_path}")
+                         print(f" Fixed: {image_path}")
+                         print(f" Exists: {os.path.exists(image_path)}")
+
+                     image_info = {
+                         "path": image_path,
+                         "original_path": original_image_path,
+                         "title": metadata.get("title", "Gambar"),
+                         "caption": metadata.get("caption", result['content'][:100] + "..."),
+                         "page": metadata.get("page", "N/A"),
+                         "year": metadata.get("year", "N/A"),
+                         "description": result['content'][:200] + "..." if len(result['content']) > 200 else result['content'],
+                         "score": result.get("score", 0.0),
+                         "is_primary": True  # Everything that reaches this point is a primary result
+                     }
+                     image_data.append(image_info)
+                     if VERBOSE_LOGGING:
+                         print(f"πŸ–ΌοΈ Added PRIMARY image: {image_path}")
+
+             # πŸ“Š Extract table information (primary sources only)
+             elif content_type == "table":
+                 table_path = metadata.get("table_path", "")
+                 if table_path and os.path.exists(table_path):
+                     try:
+                         table_info = {
+                             "path": table_path,
+                             "title": metadata.get("title", "Tabel"),
+                             "page": metadata.get("page", "N/A"),
+                             "year": metadata.get("year", "N/A"),
+                             "rows": metadata.get("rows", 0),
+                             "cols": metadata.get("cols", 0),
+                             "description": result['content'][:200] + "..." if len(result['content']) > 200 else result['content'],
+                             "score": result.get("score", 0.0),
+                             "is_primary": True  # Everything that reaches this point is a primary result
+                         }
+
+                         # Load actual table data
+                         if table_path.endswith('.csv'):
+                             df = pd.read_csv(table_path)
+                             table_info["data"] = df
+                             table_info["data_type"] = "dataframe"
+                         elif table_path.endswith('.json'):
+                             with open(table_path, 'r', encoding='utf-8') as f:
+                                 json_data = json.load(f)
+                             table_info["data"] = json_data
+                             table_info["data_type"] = "json"
+
+                         table_data.append(table_info)
+                         if VERBOSE_LOGGING:
+                             print(f"πŸ“Š Found PRIMARY table: {table_path}")
+
+                     except Exception as e:
+                         print(f"❌ Error loading table {table_path}: {e}")
+         primary_results = [r for r in results if r.get("is_primary_result", True)]
+         contextual_results = [r for r in results if not r.get("is_primary_result", True)]
+         response_data = {
+             "question": question,
+             "answer": response.strip(),
+             "context": context,
+             "sources": results,
+             "primary_sources": primary_results,
+             "contextual_sources": contextual_results,
+             "years_searched": query_context["years"],
+             "content_types_used": query_context["preferred_content_types"],
+             "total_sources": len(results),
+             "primary_sources_count": len(primary_results),
+             "contextual_sources_count": len(contextual_results),
+             "has_images": len(image_data) > 0,
+             "has_tables": len(table_data) > 0,
+             "image_data": image_data,  # Full image metadata with path, title, etc.
+             "table_data": table_data,  # Loaded table data with DataFrame/JSON
+             "image_paths": [img["path"] for img in image_data],
+             "table_paths": [tbl["path"] for tbl in table_data],
+             "year_comparison_mode": query_context["year_comparison_mode"],
+             "context_expansion_enabled": self.CONTEXT_EXPANSION_ENABLED,
+             "processing_time": datetime.now().isoformat()
+         }
+
+         if VERBOSE_LOGGING:
+             print("βœ… Query processed successfully")
+             print(f"🎯 Primary sources: {len(primary_results)}")
+             print(f"πŸ”— Contextual sources: {len(contextual_results)}")
+             print(f"πŸ–ΌοΈ Images found: {len(image_data)}")
+             print(f"πŸ“Š Tables found: {len(table_data)}")
+         return response_data
+
+     def get_context_chain(self, result_id: str, max_depth: int = 3) -> List[Dict]:
+         """Get a chain of contextually related chunks starting from a specific result"""
+         try:
+             # This would work with your vectorstore to find chunks with similar metadata
+             # Implementation depends on your vectorstore structure
+             chain = []
+             current_id = result_id
+
+             for depth in range(max_depth):
+                 # Find chunks with similar metadata to the current chunk
+                 similar_chunks = self.vectorizer.find_similar_by_metadata(current_id)
+                 if not similar_chunks:
+                     break
+
+                 # Add the most similar chunk to the chain
+                 best_match = similar_chunks[0]
+                 chain.append(best_match)
+                 current_id = best_match["metadata"]["id"]
+
+             return chain
+
+         except Exception as e:
+             print(f"❌ Error building context chain: {e}")
+             return []
+
+     def get_full_document_context(self, metadata: Dict, year: int) -> str:
+         """Get comprehensive context from the entire document/source"""
+         try:
+             # Build document identifier
+             doc_identifiers = []
+
+             if metadata.get('program'):
+                 doc_identifiers.append(metadata['program'])
+             if metadata.get('year'):
+                 doc_identifiers.append(str(metadata['year']))
+             if metadata.get('chapter'):
+                 doc_identifiers.append(metadata['chapter'])
+
+             # Search for all chunks from the same document
+             doc_query = " ".join(doc_identifiers)
+
+             # Get broader context
+             doc_chunks = self.vectorizer.query_multimodal(
+                 query_text=doc_query,
+                 year=year,
+                 content_types=None,
+                 n_results=50  # Get many chunks from the same document
+             )
+
+             # Filter chunks that are actually from the same document
+             same_doc_chunks = []
+             for chunk in doc_chunks:
+                 chunk_meta = chunk["metadata"]
+                 similarity_score = self.get_metadata_similarity_score(metadata, chunk_meta)
+                 if similarity_score > 0.5:  # Adjust threshold as needed
+                     same_doc_chunks.append(chunk)
+
+             # Sort by page number or similarity
+             same_doc_chunks.sort(key=lambda x: (
+                 x["metadata"].get("page", 999),
+                 x.get("score", 0)
+             ))
+
+             # Combine content with clear separators
+             full_context = ""
+             for i, chunk in enumerate(same_doc_chunks[:10]):  # Limit to avoid token overflow
+                 page = chunk["metadata"].get("page", "N/A")
+                 content_type = chunk["metadata"].get("content_type", "unknown")
+                 full_context += f"\n--- {content_type.upper()} (Page {page}) ---\n"
+                 full_context += chunk["content"][:500] + "...\n"
+
+             return full_context
+
+         except Exception as e:
+             print(f"❌ Error getting full document context: {e}")
+             return ""
+
+     def advanced_context_retrieval(self, query_context: Dict[str, Any], k: int = 10) -> List[Dict]:
+         """Advanced retrieval that considers document structure and relationships"""
+
+         # Step 1: Get initial high-quality results
+         initial_results = self.retrieve_multimodal_context_enhanced(query_context, k // 2)
+
+         # Step 2: For each high-quality result, get its document context
+         enhanced_results = []
+         seen_ids = set()
+
+         for result in initial_results:
+             result_id = result["metadata"].get("id", "")
+             if result_id in seen_ids:
+                 continue
+
+             seen_ids.add(result_id)
+             result["context_level"] = "primary"
+             enhanced_results.append(result)
+
+             # Get document-level context
+             year = result.get("search_year", result["metadata"].get("year"))
+             if year:
+                 doc_context = self.get_full_document_context(result["metadata"], year)
+                 if doc_context:
+                     # Create a synthetic result with full document context
+                     doc_result = {
+                         "content": doc_context,
+                         "metadata": {
+                             **result["metadata"],
+                             "content_type": "document_context",
+                             "id": f"{result_id}_doc_context"
+                         },
+                         "score": result.get("score", 0) * 0.8,  # Slightly lower score
+                         "context_level": "document",
+                         "parent_id": result_id
+                     }
+                     enhanced_results.append(doc_result)
+
+         # Step 3: Fill remaining slots with diverse content
+         remaining_k = k - len(enhanced_results)
+         if remaining_k > 0:
+             additional_results = self.vectorizer.query_multimodal(
+                 query_text=query_context["cleaned_query"],
+                 year=query_context["years"][0] if query_context["years"] else 2024,
+                 content_types=None,
+                 n_results=remaining_k * 2
+             )
+
+             for add_result in additional_results:
+                 add_id = add_result["metadata"].get("id", "")
+                 if add_id not in seen_ids and len(enhanced_results) < k:
+                     add_result["context_level"] = "supplementary"
+                     enhanced_results.append(add_result)
+                     seen_ids.add(add_id)
+
+         return enhanced_results[:k]
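For orientation, a minimal usage sketch of the restored class (illustrative, not part of the commit; it assumes the Chroma index has already been built by vectorization.py and that OPENAI_API_KEY is set in .env):

from RAG import EnhancedMultimodalRAGSystem

rag = EnhancedMultimodalRAGSystem()
result = rag.query("Mata kuliah semester 1 teknik mesin tahun 2024", k=10)

print(result["answer"])                 # LLM-generated answer text
print(result["years_searched"])         # e.g. [2024]
print(result["primary_sources_count"])  # primary hits that feed images/tables
for img in result["image_data"]:        # resolved image paths for the UI
    print(img["path"], img["title"])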
src/README.md CHANGED
@@ -1,3 +1,5 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:251cefeef9fdd6b038e53ada2062be34373d8ca262ef55aa1720c5035a9a4d1f
- size 124
+ python -m venv venv
+ source ./venv/bin/activate
+ pip install -r requirements.txt
+ streamlit run streamlit_app.py
+ To change the configuration, edit config.py
src/config.py CHANGED
@@ -1,3 +1,72 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:24f9c91b003fe3cb4a21e7f3a354a0cce9d230dfdf2d64f5ea0e0311c2ec65a1
- size 2451
+
+ COSINE_SIMILARITY_THRESHOLD = 0.80  # Threshold for text similarity
+ MAX_SIMILAR_CONTEXT = 5  # Maximum number of similar contexts to retrieve
+ VALID_YEARS = [2022, 2023, 2024]  # Valid years for filtering
+ DEFAULT_SEARCH_YEARS = [2022, 2023, 2024]  # Default if no year is specified
+ DEFAULT_LLM_MODEL = "gpt-3.5-turbo"
+ LLM_TEMPERATURE = 0.1  # Low temperature for more consistent responses
+ MAX_TOKENS = 2000
+ LLM_TIMEOUT = 30
+ MAX_RETRIES = 3
+ RETRY_DELAY = 2
+ CONTENT_TYPE_STRATEGIES = {
+     "silabus": 0.3,
+     "curriculum": 0.25,
+     "table": 0.2,
+     "image": 0.1,
+     "text_chunk": 0.4
+ }
+ TABLE_MARKDOWN_CONFIG = {
+     "max_rows": 10,
+     "max_cols": 8,
+     "include_index": False,
+     "float_format": ":.2f"
+ }
+ VERBOSE_LOGGING = True
+ LOG_RETRIEVAL_DETAILS = True
+ MAX_CHAT_HISTORY = 20  # Maximum chat history to keep
+ CONTEXT_WINDOW_SIZE = 3  # Number of previous exchanges to include in context
+ FALLBACK_RESPONSE = """Maaf, terjadi kesalahan dalam menghasilkan jawaban.
+ Silakan coba dengan pertanyaan yang lebih spesifik atau hubungi administrator sistem.
+ Contoh pertanyaan yang bisa dicoba:
+ - "Mata kuliah semester 1 teknik mesin 2022"
+ - "Kurikulum teknik industri tahun 2023"
+ - "Tabel distribusi mata kuliah"
+ """
+ CONTENT_TYPE_DESCRIPTIONS = {
+     "silabus": "πŸ“š Silabus Mata Kuliah",
+     "curriculum": "πŸŽ“ Kurikulum Program Studi",
+     "table": "πŸ“Š Tabel & Data",
+     "image": "πŸ–ΌοΈ Gambar & Diagram",
+     "text_chunk": "πŸ“ Teks Umum"
+ }
+ EXAMPLE_QUERIES = {
+     "πŸ“Š Data & Tabel": [
+         "Tolong carikan format Cuti kuliah Tahun 2022",
+         "Jadwal mata kuliah semester genap",
+         "Tabel mata kuliah wajib dan pilihan",
+         "Prasyarat mata kuliah desain produk 2022"
+     ],
+     "πŸ“š Kurikulum & Mata Kuliah": [
+         "Mata kuliah semester 1 teknik mesin tahun 2024",
+     ],
+     "πŸ“– Silabus & Detail Mata Kuliah": [
+         "Silabus mata kuliah Termodinamika",
+         "Detail pembelajaran Mekanika Fluida",
+         "Prasyarat mata kuliah Perancangan Produk"
+     ],
+     "πŸ” Perbandingan & Analisis": [
+         "Siapa Pengelola Layanan Akademik tahun 2022",
+         "Perbedaan kurikulum teknik mesin dan industri",
+         "Perubahan kurikulum dari 2022 ke 2024",
+         "Apa isi kurikulum teknik mesin 2026",
+     ]
+ }
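For reference, a quick sketch of how these settings drive retrieval (illustrative, not part of the commit): RAG.py turns each CONTENT_TYPE_STRATEGIES ratio into a per-type result budget via max(1, int(k * ratio)), so for the default k=10:

from config import CONTENT_TYPE_STRATEGIES

k = 10
budgets = {ct: max(1, int(k * ratio)) for ct, ratio in CONTENT_TYPE_STRATEGIES.items()}
print(budgets)
# {'silabus': 3, 'curriculum': 2, 'table': 2, 'image': 1, 'text_chunk': 4}
# Note: the ratios sum to 1.25, so the combined budgets can exceed k
# before deduplication trims the list back down.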
src/streamlit_app.py CHANGED
@@ -1,3 +1,521 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:8972c77e60b6dc1d30de460bad24d2d4e82a80368b22bf702e29631052f96eed
- size 17808
+ import streamlit as st
+ import os
+ from PIL import Image
+ import pandas as pd
+ import json
+ from datetime import datetime
+ from typing import List, Dict, Any, Optional
+ from RAG import EnhancedMultimodalRAGSystem
+ from config import *
+
+ # Page config
+ st.set_page_config(
+     page_title="DTMI UGM Academic Assistant",
+     page_icon="πŸŽ“",
+     layout="wide",
+     initial_sidebar_state="expanded"
+ )
+
+ # Enhanced CSS - ChatGPT style
+ st.markdown("""
+ <style>
+ /* Main header */
+ .main-header {
+     background: linear-gradient(135deg, #1e3c72 0%, #2a5298 50%, #4a90e2 100%);
+     padding: 2rem;
+     border-radius: 15px;
+     color: white;
+     text-align: center;
+     margin-bottom: 2rem;
+     box-shadow: 0 8px 32px rgba(0,0,0,0.1);
+ }
+
+ .main-header h1 {
+     margin-bottom: 0.5rem;
+     font-size: 2.5rem;
+     font-weight: 700;
+ }
+
+ .main-header p {
+     margin: 0.3rem 0;
+     opacity: 0.9;
+ }
+
+ /* Chat messages - simple black and white */
+ .user-message {
+     background: #2d2d2d;
+     color: white;
+     padding: 1.2rem;
+     border-radius: 15px;
+     margin: 1rem 0;
+     border-left: 5px solid #0084ff;
+     box-shadow: 0 4px 12px rgba(0,0,0,0.2);
+     animation: slideInRight 0.3s ease-out;
+ }
+
+ .assistant-message {
+     background: #f8f9fa;
+     color: #2d2d2d;
+     padding: 1.2rem;
+     border-radius: 15px;
+     margin: 1rem 0;
+     border-left: 5px solid #28a745;
+     box-shadow: 0 4px 12px rgba(0,0,0,0.1);
+     animation: slideInLeft 0.3s ease-out;
+ }
+
+ @keyframes slideInRight {
+     from { transform: translateX(20px); opacity: 0; }
+     to { transform: translateX(0); opacity: 1; }
+ }
+
+ @keyframes slideInLeft {
+     from { transform: translateX(-20px); opacity: 0; }
+     to { transform: translateX(0); opacity: 1; }
+ }
+
+ /* Example queries */
+ .example-query {
+     background: #fff8e1;
+     color: #333;
+     padding: 1rem;
+     border-radius: 10px;
+     margin: 0.5rem 0;
+     border-left: 4px solid #ff9800;
+     cursor: pointer;
+     transition: all 0.3s ease;
+     box-shadow: 0 2px 8px rgba(255, 152, 0, 0.1);
+ }
+
+ .example-query:hover {
+     background: #ffecb3;
+     transform: translateY(-2px);
+     box-shadow: 0 4px 12px rgba(255, 152, 0, 0.2);
+ }
+
+ /* Source preview */
+ .source-preview {
+     background: #f5f5f5;
+     color: #333;
+     padding: 1rem;
+     border-radius: 10px;
+     margin: 0.5rem 0;
+     font-size: 0.9em;
+     border-left: 3px solid #6c757d;
+ }
+
+ /* Buttons */
+ .stButton > button {
+     border-radius: 10px !important;
+     font-weight: 600 !important;
+     transition: all 0.3s ease !important;
+ }
+
+ .stButton > button:hover {
+     transform: translateY(-1px) !important;
+     box-shadow: 0 4px 12px rgba(0,0,0,0.15) !important;
+ }
+ .chat-container {
+     height: calc(100vh - 180px);
+     overflow-y: auto;
+     padding: 1rem;
+     border: 1px solid #e0e0e0;
+     border-radius: 10px;
+     background-color: #fafafa;
+     margin-bottom: 1rem;
+ }
+ .fixed-input {
+     position: fixed;
+     bottom: 2rem;
+     width: 60%;
+     max-width: 800px;
+     left: 50%;
+     transform: translateX(-50%);
+     background-color: white;
+     padding: 1rem;
+     border-radius: 10px;
+     box-shadow: 0 4px 16px rgba(0,0,0,0.1);
+     z-index: 999;
+ }
+ .spacer {
+     height: 120px;  /* Spacer so content is not hidden behind the fixed input */
+ }
+ </style>
+ """, unsafe_allow_html=True)
+
+
+ @st.cache_resource
+ def initialize_rag_system():
+     try:
+         return EnhancedMultimodalRAGSystem()
+     except Exception as e:
+         st.error(f"❌ Error initializing RAG system: {e}")
+         st.stop()
+
+
+ def display_example_queries():
+     """Display clickable example queries"""
+     st.markdown("### πŸ’‘ Contoh Pertanyaan")
+
+     for category, queries in EXAMPLE_QUERIES.items():
+         with st.expander(f"{category}", expanded=True):
+             for query in queries:
+                 if st.button(f"πŸ’¬ {query}", key=f"example_{hash(query)}", use_container_width=True):
+                     st.session_state.user_input = query
+                     st.rerun()
+
+
+ def display_tables_in_chat(table_data: List[Dict]):
+     """Display tables directly in chat"""
+     if not table_data:
+         return
+
+     st.markdown("### πŸ“Š Tabel Data")
+
+     for i, table_info in enumerate(table_data, 1):
+         with st.expander(f"πŸ“Š {table_info['title']} (Hal. {table_info['page']}, {table_info['year']})", expanded=True):
+
+             # Table metadata
+             col1, col2, col3 = st.columns(3)
+             with col1:
+                 st.metric("πŸ“„ Halaman", table_info['page'])
+             with col2:
+                 st.metric("πŸ“… Tahun", table_info['year'])
+             with col3:
+                 st.metric("πŸ“Š Score", f"{table_info['score']:.3f}")
+             # Display table data
+             try:
+                 if table_info.get("data_type") == "dataframe" and isinstance(table_info["data"], pd.DataFrame):
+                     st.dataframe(table_info["data"], use_container_width=True)
+                     # Download CSV
+                     csv_data = table_info["data"].to_csv(index=False)
+                     st.download_button(
+                         label="πŸ’Ύ Download CSV",
+                         data=csv_data,
+                         file_name=f"table_{i}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
+                         mime="text/csv"
+                     )
+
+                 elif table_info.get("data_type") == "json":
+                     st.json(table_info["data"])
+
+                     # Download JSON
+                     json_str = json.dumps(table_info["data"], indent=2, ensure_ascii=False)
+                     st.download_button(
+                         label="πŸ’Ύ Download JSON",
+                         data=json_str,
+                         file_name=f"data_{i}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
+                         mime="application/json"
+                     )
+
+                 # Show description
+                 if table_info.get('description'):
+                     st.markdown("**πŸ“ Deskripsi:**")
+                     st.text(table_info['description'])
+
+             except Exception as e:
+                 st.error(f"❌ Error displaying table: {e}")
+
+
+ def display_single_image_compact(img_info: Dict, index: int):
+     """Display a single image in compact format - clean version"""
+     try:
+         image_path = img_info["path"]
+
+         # Check if file exists
+         if not os.path.exists(image_path):
+             st.error(f"❌ Gambar {index} tidak ditemukan")
+             return
+
+         # Load and display image
+         image = Image.open(image_path)
+
+         # Display image with nice styling
+         st.image(image,
+                  caption=f"πŸ“– {img_info.get('title', 'Gambar')} - Hal. {img_info.get('page', 'N/A')} ({img_info.get('year', 'N/A')})",
+                  use_container_width=True)
+
+         # Compact metadata
+         col1, col2 = st.columns(2)
+         with col1:
+             st.metric("πŸ“Š Relevance Score", "None")
+             # f"{img_info.get('score', 0):.2f}" -- score display disabled
+         with col2:
+             st.metric("πŸ“ Ukuran", f"{image.width}Γ—{image.height}px")
+
+         # Expandable details
+         with st.expander(f"πŸ“ Detail Gambar {index}", expanded=False):
+             if img_info.get('description'):
+                 st.markdown("**πŸ“„ Deskripsi:**")
+                 st.text(img_info['description'])
+             if img_info.get('caption'):
+                 st.markdown("**πŸ’¬ Caption:**")
+                 st.text(img_info['caption'])
+
+     except Exception as e:
+         st.error(f"❌ Error loading image {index}: {str(e)}")
+
+
+ def display_single_image_full(img_info: Dict):
+     """Display a single image in full format - clean version"""
+     try:
+         image_path = img_info["path"]
+
+         if not os.path.exists(image_path):
+             st.error("❌ Gambar tidak ditemukan")
+             return
+
+         # Load image
+         image = Image.open(image_path)
+
+         # Display with title
+         st.markdown(f"### πŸ–ΌοΈ {img_info.get('title', 'Gambar')}")
+
+         # Create columns for image and metadata
+         col1, col2 = st.columns([3, 1])
+
+         with col1:
+             st.image(image, use_column_width=True)
+
+         with col2:
+             st.markdown("**πŸ“‹ Informasi Gambar**")
+             st.metric("πŸ“„ Halaman", img_info.get('page', 'N/A'))
+             st.metric("πŸ“… Tahun", img_info.get('year', 'N/A'))
+             # st.metric("πŸ“Š Score", f"{img_info.get('score', 0):.3f}")
+             st.metric("πŸ“ Dimensi", f"{image.width} Γ— {image.height}")
+
+             # Download button
+             with open(image_path, "rb") as file:
+                 st.download_button(
+                     label="πŸ’Ύ Download Gambar",
+                     data=file.read(),
+                     file_name=os.path.basename(image_path),
+                     mime="image/png",
+                     use_container_width=True
+                 )
+
+         # Description below image
+         if img_info.get('description'):
+             st.markdown("**πŸ“ Deskripsi Gambar:**")
+             st.info(img_info['description'])
+
+         if img_info.get('caption'):
+             st.markdown("**πŸ’¬ Caption:**")
+             st.info(img_info['caption'])
+
+     except Exception as e:
+         st.error(f"❌ Error loading image: {str(e)}")
+
+
+ def display_images_in_chat(image_data: List[Dict], show_details: bool = True):
+     """Display images directly in chat - clean version"""
+     if not image_data:
+         return
+
+     st.markdown("### πŸ–ΌοΈ Gambar Terkait")
+     if len(image_data) == 1:
+         st.markdown("*Ditemukan 1 gambar relevan*")
+     else:
+         st.markdown(f"*Ditemukan {len(image_data)} gambar relevan*")
+     if len(image_data) > 1:
+         cols = st.columns(min(len(image_data), 2))  # Max 2 columns
+         for i, img_info in enumerate(image_data):
+             with cols[i % 2]:
+                 display_single_image_compact(img_info, i + 1)
+     else:
+         display_single_image_full(image_data[0])
+
+
+ def enhanced_chat_interface():
+     if 'messages' not in st.session_state:
+         st.session_state.messages = []
+     if 'user_input' not in st.session_state:
+         st.session_state.user_input = ""
+     rag_system = initialize_rag_system()
+     st.markdown("""
+     <div class="main-header">
+         <h1>πŸŽ“ DTMI UGM Academic Assistant</h1>
+         <p>Asisten Cerdas Multimodal untuk Informasi Akademik DTMI UGM</p>
+         <p>πŸ’¬ Tanyakan apapun tentang kurikulum, silabus, gambar, dan tabel data</p>
+     </div>
+     """, unsafe_allow_html=True)
+
+     # Sidebar with controls
+     with st.sidebar:
+         st.markdown("### βš™οΈ Pengaturan")
+
+         # Content type preferences
+         st.markdown("### 🎯 Preferensi Konten")
+         content_preferences = []
+         for content_type, description in CONTENT_TYPE_DESCRIPTIONS.items():
+             if st.checkbox(description, key=f"pref_{content_type}"):
+                 content_preferences.append(content_type)
+
+         # Retrieval settings
+         st.markdown("### πŸ” Pengaturan Pencarian")
+         max_results = st.slider("Jumlah Konteks Maksimal", 5, 20, 10)
+
+         # Display settings
+         st.markdown("### 🎭 Tampilan")
+         show_images_inline = st.checkbox("πŸ–ΌοΈ Tampilkan Gambar", value=True)
+         show_tables_inline = st.checkbox("πŸ“Š Tampilkan Tabel", value=True)
+         compact_mode = st.checkbox("πŸ“± Mode Kompak", value=False)
+
+         # Chat statistics
+         if st.session_state.messages:
+             st.markdown("### πŸ“Š Statistik")
+             total_messages = len(st.session_state.messages)
+             st.metric("πŸ’¬ Total Pesan", total_messages)
+             st.metric("πŸ—£οΈ Percakapan", total_messages // 2)
+
+         # Clear chat
+         if st.button("πŸ—‘οΈ Hapus Chat", type="secondary", use_container_width=True):
+             st.session_state.messages = []
+             st.rerun()
+
+     # Main chat area
+     col1, col2 = st.columns([3, 1] if not compact_mode else [1, 0])
+
+     with col1:
+         # Display chat history
+         for message in st.session_state.messages:
+             if message["role"] == "user":
+                 st.markdown(f"""
+                 <div class="user-message">
+                     <strong>πŸ‘€ Anda:</strong><br>
+                     {message["content"]}
+                 </div>
+                 """, unsafe_allow_html=True)
+             else:
+                 st.markdown(f"""
+                 <div class="assistant-message">
+                     <strong>πŸ€– Assistant:</strong><br>
+                     {message["content"]}
+                 </div>
+                 """, unsafe_allow_html=True)
+
+                 # 🎯 Display multimodal content
+                 if "result_data" in message:
+                     result_data = message["result_data"]
+
+                     # Show quick stats if there is multimodal content
+                     if result_data.get("has_images") or result_data.get("has_tables"):
+                         st.markdown("---")  # Separator
+
+                         col_stats1, col_stats2, col_stats3 = st.columns(3)
+                         with col_stats1:
+                             st.metric("πŸ–ΌοΈ Gambar", len(result_data.get("image_data", [])))
+                         with col_stats2:
+                             st.metric("πŸ“Š Tabel", len(result_data.get("table_data", [])))
+                         with col_stats3:
+                             st.metric("πŸ“š Sumber", result_data.get("total_sources", 0))
+
+                     # πŸ–ΌοΈ Display images
+                     if show_images_inline and result_data.get("has_images"):
+                         display_images_in_chat(result_data.get("image_data", []))
+
+                     # πŸ“Š Display tables
+                     if show_tables_inline and result_data.get("has_tables"):
+                         display_tables_in_chat(result_data.get("table_data", []))
+
+                 # Collapsible sources
+                 if "sources" in message and message["sources"]:
+                     with st.expander("πŸ“š Lihat Sumber Informasi", expanded=False):
+                         for i, source in enumerate(message["sources"][:3], 1):
+                             content_type = source['metadata']['content_type']
+                             year = source['metadata'].get('year', 'N/A')
+                             page = source['metadata'].get('page', 'N/A')
+
+                             st.markdown(f"""
+                             **πŸ“‹ Sumber {i}:** {CONTENT_TYPE_DESCRIPTIONS.get(content_type, content_type)}
+                             **πŸ“… Tahun:** {year} | **πŸ“„ Halaman:** {page}
+                             **πŸ“ Preview:** {source['content'][:150]}...
+                             """)
+                             st.markdown("---")
+
+         # Chat input
+         user_input = st.chat_input(
+             "πŸ’¬ Tanyakan tentang kurikulum, gambar, tabel, atau informasi lainnya...", key="chat_input")
+
+         # Handle example query selection
+         if st.session_state.user_input:
+             user_input = st.session_state.user_input
+             st.session_state.user_input = ""
+
+         # πŸš€ Process user input
+         if user_input:
+             # Add user message
+             st.session_state.messages.append({"role": "user", "content": user_input})
+
+             # Show loading
+             with st.spinner("πŸ” Mencari informasi relevan..."):
+                 try:
+                     result_data = rag_system.query(
+                         user_input,
+                         k=max_results,
+                         content_filter=content_preferences if content_preferences else None
+                     )
+
+                     # Save assistant message with complete data
+                     assistant_message = {
+                         "role": "assistant",
+                         "content": result_data["answer"],
+                         "sources": result_data["sources"],
+                         "result_data": result_data
+                     }
+
+                     st.session_state.messages.append(assistant_message)
+
+                 except Exception as e:
+                     st.error(f"❌ Terjadi kesalahan: {e}")
+                     st.session_state.messages.append({
+                         "role": "assistant",
+                         "content": "Maaf, terjadi kesalahan dalam memproses pertanyaan Anda. Silakan coba lagi."
+                     })
+
+             st.rerun()
+
+     # Sidebar with example queries (only if not compact)
+     if not compact_mode:
+         with col2:
+             display_example_queries()
+
+             # Quick actions
+             st.markdown("### ⚑ Aksi Cepat")
+
+             quick_actions = [
+                 ("πŸ–ΌοΈ Cari Gambar", "Tampilkan gambar formulir atau diagram"),
+                 ("πŸ“Š Lihat Tabel", "Tabel kurikulum semester 1"),
+                 ("πŸŽ“ Info Program", "Informasi program studi teknik mesin"),
+                 ("πŸ“š Silabus", "Silabus mata kuliah wajib")
+             ]
+
+             for label, query in quick_actions:
+                 if st.button(label, use_container_width=True):
+                     st.session_state.user_input = query
+                     st.rerun()
+             if st.session_state.messages:
+                 st.markdown("### πŸ“€ Export")
+                 if st.button("πŸ’Ύ Download Chat", use_container_width=True):
+                     chat_export = ""
+                     for msg in st.session_state.messages:
+                         role = "User" if msg["role"] == "user" else "Assistant"
+                         chat_export += f"**{role}:** {msg['content']}\n\n"
+
+                     st.download_button(
+                         label="πŸ“„ Download Markdown",
+                         data=chat_export,
+                         file_name=f"chat_dtmi_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md",
+                         mime="text/markdown",
+                         use_container_width=True
+                     )
+
+
+ def main():
+     """Main application function"""
+     enhanced_chat_interface()
+
+
+ if __name__ == "__main__":
+     main()
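For reference, the chat history entries that the interface above appends to st.session_state.messages follow this shape (illustrative, not part of the commit):

# Shape of entries stored in st.session_state.messages:
user_msg = {"role": "user", "content": "Silabus Termodinamika"}
assistant_msg = {
    "role": "assistant",
    "content": "jawaban...",  # result_data["answer"]
    "sources": [],            # result_data["sources"]; first 3 previewed in an expander
    "result_data": {},        # full query() payload; drives inline image/table rendering
}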
src/year_parser.py CHANGED
@@ -1,3 +1,60 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:dd9534ffe3bedced91e0cfe3fb3ccc2c7cc7a7d88a85f301607d4185752db0bc
- size 2316
+
+ import re
+ from typing import List, Tuple
+ from config import *
+
+
+ class YearParser:
+     VALID_YEARS = [2022, 2023, 2024]
+
+     @staticmethod
+     def extract_years(query: str) -> Tuple[List[int], str, bool, bool]:
+         years = []
+         cleaned_query = query
+         user_mentioned_year = False
+         user_mentioned_invalid_year = False
+
+         single_year_pattern = r'\b(20\d{2})\b'
+         single_years = re.findall(single_year_pattern, query)
+
+         range_patterns = [
+             r'\b(20\d{2})\s*-\s*(20\d{2})\b',         # 2022-2024
+             r'\b(20\d{2})\s+sampai\s+(20\d{2})\b',    # 2022 sampai 2024
+             r'\b(20\d{2})\s+hingga\s+(20\d{2})\b',    # 2022 hingga 2024
+             r'\b(20\d{2})\s+s\.?d\.?\s+(20\d{2})\b',  # 2022 s.d 2024
+         ]
+         range_found = False
+
+         for pattern in range_patterns:
+             matches = re.findall(pattern, query, re.IGNORECASE)
+             if matches:
+                 user_mentioned_year = True
+                 for start_year, end_year in matches:
+                     start = int(start_year)
+                     end = int(end_year)
+                     for year in range(start, end + 1):
+                         if year in YearParser.VALID_YEARS:
+                             years.append(year)
+                         else:
+                             user_mentioned_invalid_year = True
+                 range_found = True
+                 cleaned_query = re.sub(pattern, '', cleaned_query, flags=re.IGNORECASE)
+
+         if not range_found and single_years:
+             user_mentioned_year = True
+             for year in single_years:
+                 y = int(year)
+                 if y in YearParser.VALID_YEARS:
+                     years.append(y)
+                 else:
+                     user_mentioned_invalid_year = True
+             cleaned_query = re.sub(single_year_pattern, '', cleaned_query)
+
+         # Do not fall back to all valid years when the user mentioned a year
+         # but none of the mentioned years are valid
+         if not years and not user_mentioned_year:
+             years = YearParser.VALID_YEARS.copy()
+
+         cleaned_query = re.sub(r'\s+', ' ', cleaned_query).strip()
+         cleaned_query = re.sub(r'^[,\-\s]+|[,\-\s]+$', '', cleaned_query)
+
+         return list(sorted(set(years))), cleaned_query, user_mentioned_year, user_mentioned_invalid_year
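For illustration, a minimal usage sketch of the parser above (not part of the commit):

from year_parser import YearParser

years, cleaned, mentioned, invalid = YearParser.extract_years(
    "Bandingkan kurikulum 2022 sampai 2024")
print(years)      # [2022, 2023, 2024]
print(cleaned)    # "Bandingkan kurikulum"
print(mentioned)  # True
print(invalid)    # False

# An out-of-range year is flagged instead of silently widening the search:
years, cleaned, mentioned, invalid = YearParser.extract_years("kurikulum 2026")
print(years, invalid)  # [] True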