Spaces:
Build error
Build error
Commit Β·
1b060e0
1
Parent(s): bdc2b18
Untrack .py and .md from LFS and restore as regular files
Browse files- README.md +17 -3
- src/RAG.py +883 -3
- src/README.md +5 -3
- src/config.py +72 -3
- src/streamlit_app.py +521 -3
- src/year_parser.py +60 -3
README.md
CHANGED
|
@@ -1,3 +1,17 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Optima
|
| 3 |
+
emoji: "π"
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: red
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 8501
|
| 8 |
+
tags:
|
| 9 |
+
- streamlit
|
| 10 |
+
pinned: false
|
| 11 |
+
app_file: src/streamlit_app.py
|
| 12 |
+
short_description: Streamlit template space
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# Welcome to Optima
|
| 16 |
+
|
| 17 |
+
test
|
src/RAG.py
CHANGED
|
@@ -1,3 +1,883 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from typing import List, Dict, Any, Optional, Tuple, Set
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
import numpy as np
|
| 8 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 9 |
+
import re
|
| 10 |
+
from langchain.embeddings import OpenAIEmbeddings
|
| 11 |
+
from langchain.vectorstores import Chroma
|
| 12 |
+
from langchain.chat_models import ChatOpenAI
|
| 13 |
+
from langchain.prompts import PromptTemplate
|
| 14 |
+
from collections import defaultdict
|
| 15 |
+
|
| 16 |
+
from vectorization import LangChainMultimodalVectorizer
|
| 17 |
+
from year_parser import YearParser
|
| 18 |
+
from config import *
|
| 19 |
+
load_dotenv()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class EnhancedMultimodalRAGSystem:
|
| 23 |
+
def __init__(self):
    """Initialize the enhanced RAG system with multimodal capabilities.

    Wires up the multimodal vectorizer, the OpenAI chat model, and the year
    parser, then copies retrieval thresholds from config into instance
    attributes so they can be tuned per instance.
    """
    self.vectorizer = LangChainMultimodalVectorizer()
    # LLM settings come from the environment, with config defaults as fallback.
    self.llm = ChatOpenAI(
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        model_name=os.getenv("OPENAI_MODEL", DEFAULT_LLM_MODEL),
        temperature=LLM_TEMPERATURE,
        max_tokens=MAX_TOKENS,
        request_timeout=LLM_TIMEOUT
    )
    self.year_parser = YearParser()
    # Retrieval thresholds mirrored from config (see config.py).
    self.COSINE_SIMILARITY_THRESHOLD = COSINE_SIMILARITY_THRESHOLD
    self.MAX_SIMILAR_CONTEXT = MAX_SIMILAR_CONTEXT
    self.VALID_YEARS = VALID_YEARS

    # New: Context expansion settings (defined here, not in config).
    self.CONTEXT_EXPANSION_ENABLED = True
    self.MAX_CONTEXT_CHUNKS_PER_SOURCE = 5  # Max additional chunks per source
    self.CONTEXT_SIMILARITY_THRESHOLD = 0.7  # Similarity threshold for context expansion

    if VERBOSE_LOGGING:
        # NOTE(review): emoji glyphs in these messages were mangled during
        # text extraction — confirm/restore the original characters.
        print(f"π Enhanced Multimodal RAG System initialized")
        print(f" π§ LLM Model: {os.getenv('OPENAI_MODEL', DEFAULT_LLM_MODEL)}")
        print(f" π Cosine Similarity Threshold: {self.COSINE_SIMILARITY_THRESHOLD}")
        print(f" π Valid Years: {self.VALID_YEARS}")
        print(f" π Context Expansion: {self.CONTEXT_EXPANSION_ENABLED}")
|
| 49 |
+
|
| 50 |
+
def get_metadata_similarity_score(self, meta1: Dict, meta2: Dict) -> float:
    """Calculate a weighted similarity score in [0, 1] between two metadata dicts.

    Only fields present in *both* dicts contribute; the total is normalized by
    the combined weight of the compared fields, so a perfect match on any
    shared subset yields 1.0. Returns 0.0 when no weighted field is shared.

    Args:
        meta1: First metadata dict (chunk metadata from the vector store).
        meta2: Second metadata dict to compare against.

    Returns:
        Normalized similarity score, 0.0-1.0.
    """
    similarity_score = 0.0
    total_weight = 0.0

    # Relative importance of each metadata field for "same context" matching.
    field_weights = {
        'year': 0.3,
        'page': 0.2,
        'program': 0.25,
        'semester': 0.15,
        'chapter': 0.2,
        'section': 0.15,
        'subsection': 0.1,
        'content_type': 0.2,
        'course_code': 0.15,
        'mata_kuliah': 0.15
    }
    for field, weight in field_weights.items():
        if field in meta1 and field in meta2:
            total_weight += weight
            if field in ['year', 'page', 'semester']:
                if meta1[field] == meta2[field]:
                    similarity_score += weight
                elif field == 'page':
                    # Raw values differ; pages may still be numerically equal
                    # ("3" vs 3) or close enough (adjacent pages usually belong
                    # to the same document section) for partial credit.
                    try:
                        page_diff = abs(int(meta1[field]) - int(meta2[field]))
                        if page_diff == 0:
                            similarity_score += weight
                        elif page_diff <= 2:
                            similarity_score += weight * 0.5
                    except (TypeError, ValueError):
                        # Non-numeric page metadata: no partial credit.
                        # (Was a bare `except:` — narrowed so real bugs surface.)
                        pass
            else:
                # Free-text fields: exact match scores full weight, substring
                # containment (e.g. "teknik" vs "teknik mesin") scores 70%.
                str1, str2 = str(meta1[field]).lower(), str(meta2[field]).lower()
                if str1 == str2:
                    similarity_score += weight
                elif str1 in str2 or str2 in str1:
                    similarity_score += weight * 0.7
    return similarity_score / total_weight if total_weight > 0 else 0.0
|
| 91 |
+
|
| 92 |
+
def find_contextual_chunks(self, base_result: Dict, all_results: List[Dict]) -> List[Dict]:
    """Select chunks contextually related to *base_result*.

    A candidate qualifies when it has a different id, the same year, and a
    metadata-similarity score at or above ``CONTEXT_SIMILARITY_THRESHOLD``.
    Each selected chunk is annotated with ``context_similarity_score`` and the
    best ``MAX_CONTEXT_CHUNKS_PER_SOURCE`` matches are returned, highest first.
    """
    base_meta = base_result["metadata"]
    base_id = base_meta.get("id")
    base_year = base_meta.get("year")

    candidates = []
    for candidate in all_results:
        meta = candidate["metadata"]
        # Skip the base chunk itself and anything from a different year.
        if meta.get("id") == base_id or meta.get("year") != base_year:
            continue
        score = self.get_metadata_similarity_score(base_meta, meta)
        if score >= self.CONTEXT_SIMILARITY_THRESHOLD:
            candidate["context_similarity_score"] = score
            candidates.append(candidate)

    candidates.sort(key=lambda c: c["context_similarity_score"], reverse=True)
    return candidates[:self.MAX_CONTEXT_CHUNKS_PER_SOURCE]
|
| 111 |
+
|
| 112 |
+
def get_document_chunks_by_metadata(self, metadata: Dict, year: int) -> List[Dict]:
    """Fetch a broad batch of chunks from the same document/source.

    Builds a "field:value" search query out of the structural metadata fields
    (program, semester, chapter, section), falling back to the title when none
    are present, then asks the vectorizer for 20 untyped results so the caller
    has material for context expansion. Returns an empty list on failure.
    """
    try:
        # Assemble filter terms from whichever structural fields are set.
        filter_terms = [
            f"{field}:{metadata[field]}"
            for field in ("program", "semester", "chapter", "section")
            if metadata.get(field)
        ]
        search_query = " ".join(filter_terms) if filter_terms else metadata.get('title', '')

        # Broad search: more results than usual, no content-type restriction.
        return self.vectorizer.query_multimodal(
            query_text=search_query,
            year=year,
            content_types=None,
            n_results=20
        )
    except Exception as e:
        print(f"β Error getting document chunks: {e}")
        return []
|
| 143 |
+
|
| 144 |
+
def expand_context_for_results(self, initial_results: List[Dict]) -> List[Dict]:
    """Expand context by finding related chunks for each initial result.

    Marks each initial hit as a primary result, then pulls additional chunks
    from the same document (via metadata similarity) and appends them tagged
    with their parent's id. Deduplicates by metadata id throughout.
    """
    # Feature flag set in __init__; when off this is a pass-through.
    if not self.CONTEXT_EXPANSION_ENABLED:
        return initial_results

    expanded_results = []
    seen_ids = set()  # metadata ids already emitted, to avoid duplicates

    for result in initial_results:
        # Add the original result
        result_id = result["metadata"].get("id", "")
        if result_id not in seen_ids:
            result["is_primary_result"] = True
            expanded_results.append(result)
            seen_ids.add(result_id)

        # Find contextual chunks.
        # Prefer the year the result was retrieved under; fall back to the
        # year recorded in its own metadata.
        year = result.get("search_year", result["metadata"].get("year"))
        if year:
            document_chunks = self.get_document_chunks_by_metadata(
                result["metadata"], year
            )

            contextual_chunks = self.find_contextual_chunks(result, document_chunks)

            # Add contextual chunks, linked back to their primary result.
            for ctx_chunk in contextual_chunks:
                ctx_id = ctx_chunk["metadata"].get("id", "")
                if ctx_id not in seen_ids:
                    ctx_chunk["is_primary_result"] = False
                    ctx_chunk["parent_result_id"] = result_id
                    expanded_results.append(ctx_chunk)
                    seen_ids.add(ctx_id)

                    if VERBOSE_LOGGING:
                        print(f"π Added contextual chunk for {result_id}: {ctx_id}")

    if VERBOSE_LOGGING:
        primary_count = sum(1 for r in expanded_results if r.get("is_primary_result", False))
        context_count = len(expanded_results) - primary_count
        print(
            f"π Context expansion: {primary_count} primary + {context_count} contextual = {len(expanded_results)} total")

    return expanded_results
|
| 188 |
+
|
| 189 |
+
def group_related_content(self, results: List[Dict]) -> Dict[str, List[Dict]]:
    """Bucket results by a composite key built from shared metadata fields.

    The key concatenates whichever of program/year/semester/chapter/
    content_type are present on each result; results with none of these
    fields land in the "general" bucket.
    """
    groups = defaultdict(list)

    # (metadata field, key prefix) pairs, in fixed key order.
    key_spec = [
        ("program", "prog"),
        ("year", "year"),
        ("semester", "sem"),
        ("chapter", "ch"),
        ("content_type", "type"),
    ]

    for result in results:
        metadata = result["metadata"]
        parts = [
            f"{prefix}_{metadata[field]}"
            for field, prefix in key_spec
            if metadata.get(field)
        ]
        bucket = "_".join(parts) if parts else "general"
        groups[bucket].append(result)

    return dict(groups)
|
| 214 |
+
|
| 215 |
+
def retrieve_multimodal_context_enhanced(self, query_context: Dict[str, Any], k: int = 10) -> List[Dict]:
    """Enhanced retrieval with context expansion.

    Four stages: (1) per-year, per-content-type retrieval from the vector
    store, (2) deduplication/diversity ranking down to k, (3) metadata-driven
    context expansion, (4) final interleaved ranking capped at 2*k.
    """
    all_results = []

    # Translate configured per-content-type ratios into absolute result
    # counts (at least 1 each). CONTENT_TYPE_STRATEGIES comes from config.
    content_strategies = {}
    for content_type, ratio in CONTENT_TYPE_STRATEGIES.items():
        content_strategies[content_type] = max(1, int(k * ratio))

    if LOG_RETRIEVAL_DETAILS:
        # NOTE(review): emoji glyphs mangled during extraction — restore.
        print(f"π― Content strategies: {content_strategies}")
        print(f"π Searching years: {query_context['years']}")

    # Step 1: Get initial results, year by year.
    for year in query_context["years"]:
        if year not in self.VALID_YEARS:
            print(f"β οΈ Skipping invalid year: {year}")
            continue

        try:
            # Targeted pass: query each preferred content type with its
            # allotted share of k; these hits are flagged as priority.
            if query_context.get("preferred_content_types"):
                for content_type in query_context["preferred_content_types"]:
                    results = self.vectorizer.query_multimodal(
                        query_text=query_context["cleaned_query"],
                        year=year,
                        content_types=[content_type],
                        n_results=content_strategies.get(content_type, k//4)
                    )
                    for result in results:
                        result["search_year"] = year
                        result["content_priority"] = True
                    all_results.extend(results)

            # General pass: fill the remaining budget with an untyped query.
            remaining_k = max(1, k - len(all_results))
            general_results = self.vectorizer.query_multimodal(
                query_text=query_context["cleaned_query"],
                year=year,
                content_types=None,
                n_results=remaining_k
            )
            for result in general_results:
                result["search_year"] = year
                result["content_priority"] = False
            all_results.extend(general_results)

        except Exception as e:
            print(f"β Error retrieving from year {year}: {e}")

    # Step 2: deduplicate by id and enforce content-type diversity.
    unique_results = self._deduplicate_and_rank_results(all_results, k)

    # Step 3: pull in related chunks via metadata similarity.
    expanded_results = self.expand_context_for_results(unique_results)

    # Step 4: Final ranking and limiting
    final_results = self._final_ranking_with_context(expanded_results, k * 2)  # Allow more results due to context

    if VERBOSE_LOGGING:
        print(f"π Final results with context: {len(final_results)}")

    return final_results
|
| 275 |
+
|
| 276 |
+
def _final_ranking_with_context(self, results: List[Dict], max_results: int) -> List[Dict]:
|
| 277 |
+
"""Final ranking that considers both primary results and their context"""
|
| 278 |
+
# Separate primary and contextual results
|
| 279 |
+
primary_results = [r for r in results if r.get("is_primary_result", True)]
|
| 280 |
+
contextual_results = [r for r in results if not r.get("is_primary_result", True)]
|
| 281 |
+
|
| 282 |
+
# Sort primary results by score
|
| 283 |
+
primary_results.sort(key=lambda x: x.get("score", 0), reverse=True)
|
| 284 |
+
|
| 285 |
+
# For each primary result, add its best contextual chunks
|
| 286 |
+
final_results = []
|
| 287 |
+
for primary in primary_results:
|
| 288 |
+
if len(final_results) >= max_results:
|
| 289 |
+
break
|
| 290 |
+
|
| 291 |
+
final_results.append(primary)
|
| 292 |
+
|
| 293 |
+
# Add related contextual chunks
|
| 294 |
+
primary_id = primary["metadata"].get("id", "")
|
| 295 |
+
related_contexts = [
|
| 296 |
+
r for r in contextual_results
|
| 297 |
+
if r.get("parent_result_id") == primary_id
|
| 298 |
+
]
|
| 299 |
+
|
| 300 |
+
# Sort contextual chunks by their similarity score
|
| 301 |
+
related_contexts.sort(key=lambda x: x.get("context_similarity_score", 0), reverse=True)
|
| 302 |
+
|
| 303 |
+
# Add top contextual chunks
|
| 304 |
+
for ctx in related_contexts[:2]: # Limit to 2 contextual chunks per primary
|
| 305 |
+
if len(final_results) < max_results:
|
| 306 |
+
final_results.append(ctx)
|
| 307 |
+
|
| 308 |
+
return final_results
|
| 309 |
+
|
| 310 |
+
def format_enhanced_context_with_grouping(self, results: List[Dict]) -> str:
    """Format context with grouping and relationship indicators.

    Buckets results via ``group_related_content``, then renders each group
    under a divider header, dispatching per content type to the matching
    ``enhance_*`` formatter and marking each source as PRIMARY or CONTEXT.
    """
    if not results:
        return "Tidak ada informasi yang relevan ditemukan."

    # Group related content
    grouped_results = self.group_related_content(results)
    context_parts = []

    for group_key, group_results in grouped_results.items():
        # Divider + header per metadata group.
        context_parts.append(f"\n{'='*60}")
        context_parts.append(f"π GRUP: {group_key.replace('_', ' ').upper()}")
        context_parts.append(f"{'='*60}")

        for i, result in enumerate(group_results, 1):
            content_type = result["metadata"]["content_type"]
            is_primary = result.get("is_primary_result", True)

            # Add indicator for primary vs contextual
            result_type = "π― PRIMARY" if is_primary else "π CONTEXT"

            # Enhanced formatting based on content type
            if content_type == "table":
                context_part = self.enhance_table_context_with_markdown(result)
            elif content_type == "image":
                context_part = self.enhance_image_context_with_details(result)
            elif content_type == "silabus":
                context_part = self.enhance_silabus_context_detailed(result)
            elif content_type == "curriculum":
                context_part = self.enhance_curriculum_context_detailed(result)
            elif content_type == "text_chunk":
                context_part = self.enhance_text_context_detailed(result)
            else:
                # Generic fallback for unrecognized content types.
                context_part = f"""
**KONTEN {content_type.upper()}:**
- **Tahun:** {result["metadata"].get('year', 'N/A')}
- **Halaman:** {result["metadata"].get('page', 'N/A')}
- **Context:** {result.get('context_text', '')[:200]}...

**Konten:**
{result['content'][:500]}...
"""

            header = f"**{result_type} SUMBER {i}:**"
            if not is_primary:
                # Surface how strongly this chunk relates to its primary.
                similarity_score = result.get("context_similarity_score", 0)
                header += f" (Similarity: {similarity_score:.2f})"

            context_parts.append(f"{header}\n{context_part}")

    return "\n\n".join(context_parts)
|
| 361 |
+
|
| 362 |
+
def _deduplicate_and_rank_results(self, all_results: List[Dict], k: int) -> List[Dict]:
    """Deduplicate results by metadata id and keep a diverse top-k.

    NOTE(review): the sort is ascending on "score", which assumes the score
    is a distance (lower = better) as vector stores like Chroma return —
    confirm against query_multimodal's scoring. Among equal scores,
    priority-flagged results sort first.
    """
    seen_ids = set()
    unique_results = []
    sorted_results = sorted(
        all_results,
        key=lambda x: (x.get("score", 0), not x.get("content_priority", False))
    )

    # Cap non-priority results per content type to keep the mix diverse.
    content_type_counts = {}
    max_per_type = max(1, k // len(CONTENT_TYPE_STRATEGIES))

    for result in sorted_results:
        result_id = result["metadata"].get("id", "")
        content_type = result["metadata"]["content_type"]

        # Skip duplicates
        if result_id in seen_ids:
            continue

        # Limit per content type for diversity (unless priority content)
        if not result.get("content_priority", False):
            if content_type_counts.get(content_type, 0) >= max_per_type:
                continue

        seen_ids.add(result_id)
        content_type_counts[content_type] = content_type_counts.get(content_type, 0) + 1

        # Ensure every result carries context_text for downstream formatting.
        if "context_text" not in result:
            result["context_text"] = result["metadata"].get("context_text", "")

        unique_results.append(result)

        if len(unique_results) >= k:
            break

    return unique_results
|
| 399 |
+
|
| 400 |
+
def enhance_table_context_with_markdown(self, result: Dict) -> str:
    """Enhanced table context with markdown formatting.

    Renders a metadata header (title, dimensions, provenance) followed by the
    table's full content, for inclusion in the LLM prompt.
    """
    metadata = result["metadata"]
    context_text = result.get("context_text", "")

    # NOTE(review): "Γ" below is a mangled multiplication sign (×) from
    # extraction — confirm against the original source.
    enhanced_context = f"""
**TABEL ENHANCED:**
- **Judul:** {metadata.get('title', 'N/A')}
- **Ukuran:** {metadata.get('rows', 0)} baris Γ {metadata.get('cols', 0)} kolom
- **Tahun:** {metadata.get('year', 'N/A')}
- **Halaman:** {metadata.get('page', 'N/A')}
- **Context:** {context_text}
- **Preview:** {result['content'][:300]}...

**Konten Lengkap:**
{result['content']}
"""
    return enhanced_context
|
| 418 |
+
|
| 419 |
+
def enhance_image_context_with_details(self, result: Dict) -> str:
    """Enhanced image context with detailed metadata.

    Renders title/caption/provenance plus a truncated description and the
    stored image path, for inclusion in the LLM prompt.
    """
    metadata = result["metadata"]
    context_text = result.get("context_text", "")

    enhanced_context = f"""
**GAMBAR ENHANCED:**
- **Judul:** {metadata.get('title', 'N/A')}
- **Caption:** {metadata.get('caption', 'N/A')}
- **Tahun:** {metadata.get('year', 'N/A')}
- **Halaman:** {metadata.get('page', 'N/A')}
- **Context:** {context_text}
- **Deskripsi:** {result['content'][:300]}...

**Path Gambar:** {metadata.get('image_path', 'N/A')}
"""
    return enhanced_context
|
| 436 |
+
|
| 437 |
+
def enhance_silabus_context_detailed(self, result: Dict) -> str:
    """Enhanced silabus (course syllabus) context with comprehensive details.

    Renders course identity (name, code, program, semester, credits) and
    provenance, followed by the chunk's full content.
    """
    metadata = result["metadata"]
    context_text = result.get("context_text", "")

    # NOTE(review): .title() on the 'program' value will raise AttributeError
    # if the metadata default is ever non-string — relies on 'N/A' fallback.
    enhanced_context = f"""
**SILABUS ENHANCED:**
- **Mata Kuliah:** {metadata.get('mata_kuliah', 'N/A')} ({metadata.get('course_code', 'N/A')})
- **Program Studi:** {metadata.get('program', 'N/A').title()}
- **Semester:** {metadata.get('semester', 'N/A')}
- **SKS:** {metadata.get('sks', 'N/A')}
- **Tipe Silabus:** {metadata.get('silabus_type', 'N/A')}
- **Tahun Kurikulum:** {metadata.get('year', 'N/A')}
- **Halaman:** {metadata.get('page', 'N/A')}
- **Context Text:** {context_text}

**Konten Lengkap:**
{result['content']}
"""
    return enhanced_context
|
| 457 |
+
|
| 458 |
+
def enhance_curriculum_context_detailed(self, result: Dict) -> str:
    """Enhanced curriculum context with comprehensive details.

    Renders the curriculum table's identity (program, semester, table type,
    course count) and provenance, followed by the chunk's full content.
    """
    metadata = result["metadata"]
    context_text = result.get("context_text", "")

    enhanced_context = f"""
**KURIKULUM ENHANCED:**
- **Program Studi:** {metadata.get('program', 'N/A').title()}
- **Semester:** {metadata.get('semester', 'N/A')}
- **Jenis Tabel:** {metadata.get('table_type', 'N/A')}
- **Jumlah Mata Kuliah:** {metadata.get('rows_count', 'N/A')}
- **Tahun Kurikulum:** {metadata.get('year', 'N/A')}
- **Halaman:** {metadata.get('page', 'N/A')}
- **Context Text:** {context_text}

**Konten Lengkap:**
{result['content']}
"""
    return enhanced_context
|
| 477 |
+
|
| 478 |
+
def enhance_text_context_detailed(self, result: Dict) -> str:
    """Enhanced text-chunk context with comprehensive details.

    Renders the chunk's document position (chapter/section/subsection) and
    provenance, followed by the chunk's full content.
    """
    metadata = result["metadata"]
    context_text = result.get("context_text", "")

    enhanced_context = f"""
**TEKS ENHANCED:**
- **Bab:** {metadata.get('chapter', 'N/A')}
- **Bagian:** {metadata.get('section', 'N/A')}
- **Sub-bagian:** {metadata.get('subsection', 'N/A')}
- **Tahun:** {metadata.get('year', 'N/A')}
- **Halaman:** {metadata.get('page', 'N/A')}
- **Context Text:** {context_text}

**Konten Lengkap:**
{result['content']}
"""
    return enhanced_context
|
| 496 |
+
|
| 497 |
+
def format_enhanced_context(self, results: List[Dict]) -> str:
    """Format retrieval results into LLM-ready context text.

    Thin alias kept for backward compatibility: the grouped formatter does
    all the work.
    """
    return self.format_enhanced_context_with_grouping(results)
|
| 500 |
+
|
| 501 |
+
def generate_response(self, query: str, context: str, chat_history: Optional[List[Dict]] = None) -> str:
    """Generate response using LLM with context and chat history.

    Builds an Indonesian-language prompt for the DTMI UGM academic assistant,
    embedding recent chat history and the retrieved context, then calls the
    LLM with a retry loop. Returns FALLBACK_RESPONSE if all attempts fail.
    """

    # Prepare chat history context.
    # History is included only when there is more than one message; the
    # current message (last entry) is excluded below. Each turn is truncated
    # to 200 characters to bound prompt size.
    chat_history_text = ""
    if chat_history and len(chat_history) > 1:
        recent_messages = chat_history[-CONTEXT_WINDOW_SIZE:]
        chat_history_text = "\n\nRiwayat Percakapan Terakhir:\n"
        for msg in recent_messages[:-1]:  # Exclude current message
            role = "User" if msg["role"] == "user" else "Assistant"
            chat_history_text += f"{role}: {msg['content'][:200]}...\n"

    # Enhanced prompt (Indonesian: role, history, question, context, and
    # eight answering instructions, ending with "Jawaban:" / "Answer:").
    enhanced_prompt = f"""
Anda adalah asisten akademik DTMI UGM yang membantu mahasiswa dan dosen.

{chat_history_text}

Pertanyaan Saat Ini: {query}

Konteks Informasi:
{context}

Instruksi:
1. Berikan jawaban yang komprehensif dan akurat
2. Gunakan informasi dari konteks yang relevan
3. Jika merujuk ke tahun atau program studi, sebutkan secara spesifik
4. Format jawaban dengan struktur yang jelas (gunakan bullet points, numbering jika perlu)
5. Jika ada tabel atau data, jelaskan dengan detail
6. Akhiri dengan saran atau informasi tambahan yang berguna
7. Pertimbangkan konteks percakapan sebelumnya jika relevan
8. Manfaatkan informasi kontekstual yang tersedia untuk memberikan jawaban yang lebih lengkap

Jawaban:
"""
    # Retry loop: transient LLM/API errors get MAX_RETRIES attempts with a
    # fixed RETRY_DELAY pause between them (config values).
    for attempt in range(MAX_RETRIES):
        try:
            response = self.llm.predict(enhanced_prompt)
            return response
        except Exception as e:
            if attempt == MAX_RETRIES - 1:
                return FALLBACK_RESPONSE
            else:
                import time
                time.sleep(RETRY_DELAY)

    # Unreachable in practice (loop always returns), kept as a safety net.
    return FALLBACK_RESPONSE
|
| 548 |
+
|
| 549 |
+
def parse_query_context(self, query: str) -> Dict[str, Any]:
    """Derive retrieval hints from a raw user query.

    Extracts curriculum years via the year parser, detects year-comparison
    intent (comparison keyword + at least two years), and maps Indonesian/
    English keyword hits onto preferred content types.
    """
    years, cleaned_query, user_mentioned_year, user_mentioned_invalid_year = \
        self.year_parser.extract_years(query)

    query_lower = cleaned_query.lower()

    # Comparison mode requires both a comparison keyword and multiple years.
    comparison_keywords = ["bandingkan", "banding", "perbandingan",
                           "dibanding", "vs", "versus", "perbedaan"]
    year_comparison_mode = len(years) > 1 and any(
        keyword in query_lower for keyword in comparison_keywords
    )

    # Keyword cues that suggest which stored content types to prioritize.
    content_type_hints = {
        "silabus": ["silabus", "mata kuliah", "course", "sks", "pembelajaran", "materi"],
        "curriculum": ["kurikulum", "curriculum", "semester", "program studi", "struktur"],
        "table": ["tabel", "table", "data", "statistik", "daftar", "distribusi"],
        "image": ["gambar", "image", "foto", "diagram", "struktur", "chart"],
        "text_chunk": ["informasi", "penjelasan", "deskripsi", "detail", "tentang"]
    }
    preferred_types = [
        content_type
        for content_type, keywords in content_type_hints.items()
        if any(keyword in query_lower for keyword in keywords)
    ]

    return {
        "original_query": query,
        "cleaned_query": cleaned_query,
        "years": years,
        "preferred_content_types": preferred_types,
        "year_comparison_mode": year_comparison_mode
    }
|
| 578 |
+
|
| 579 |
+
def query(self, question: str, k: int = 10, content_filter: List[str] = None) -> Dict[str, Any]:
    """Answer a user question end-to-end.

    Pipeline:
      1. Extract years from the question; bail out early with a fixed
         "not available" answer when only invalid years were mentioned.
      2. Parse query context and retrieve up to ``k`` multimodal chunks.
      3. Generate an LLM answer from the formatted context (falling back to
         ``FALLBACK_RESPONSE`` on any generation error).
      4. Collect image/table assets from the PRIMARY retrieval results only.

    Args:
        question: raw user question (may contain year mentions).
        k: maximum number of context chunks to retrieve.
        content_filter: optional list of content types that overrides the
            auto-detected ``preferred_content_types``.

    Returns:
        Dict with the answer, raw context, primary/contextual source splits,
        loaded image/table data, and processing metadata.
    """
    years, cleaned_query, user_mentioned_year, user_mentioned_invalid_year = self.year_parser.extract_years(
        question)
    # Early exit: the user asked about a year that is outside the database.
    if user_mentioned_invalid_year and not years:
        return {
            "question": question,
            "answer": "Maaf, informasi mengenai kurikulum tahun yang Anda minta tidak tersedia dalam konteks database ini.",
            "context": "",
            "sources": [],
            "primary_sources": [],
            "contextual_sources": [],
            "years_searched": [],
            "content_types_used": [],
            "total_sources": 0,
            "primary_sources_count": 0,
            "contextual_sources_count": 0,
            "has_images": False,
            "has_tables": False,
            "image_data": [],
            "table_data": [],
            "image_paths": [],
            "table_paths": [],
            "year_comparison_mode": False,
            "context_expansion_enabled": self.CONTEXT_EXPANSION_ENABLED,
            "processing_time": datetime.now().isoformat()
        }
    if VERBOSE_LOGGING:
        print(f"π Processing query: {question}")
    query_context = self.parse_query_context(question)
    # An explicit content filter from the caller overrides auto-detection.
    if content_filter:
        query_context["preferred_content_types"] = content_filter
    if LOG_RETRIEVAL_DETAILS:
        print(f"π Years: {query_context['years']}")
        print(f"π― Content types: {query_context['preferred_content_types']}")
        print(f"π Content filter: {content_filter}")
    results = self.retrieve_multimodal_context_enhanced(query_context, k)
    context = self.format_enhanced_context(results)
    try:
        response = self.generate_response(question, context)
    except Exception as e:
        print(f"β Error generating answer: {e}")
        response = FALLBACK_RESPONSE
    image_data = []
    table_data = []

    for result in results:
        metadata = result["metadata"]
        content_type = metadata.get("content_type", "")

        # Only PRIMARY sources contribute image/table assets.
        is_primary = result.get("is_primary_result", True)
        if not is_primary:
            continue  # skip contextual sources

        # Extract image information (primary results only).
        if content_type == "image":
            original_image_path = metadata.get("image_path", "")
            if original_image_path:
                # Path fixing: stored paths may be relative to the repo root
                # while the app runs from inside ./src.
                fixed_path = original_image_path
                if fixed_path.startswith("./src/"):
                    fixed_path = fixed_path.replace("./src/", "./")
                elif fixed_path.startswith("src/"):
                    fixed_path = fixed_path.replace("src/", "./")

                if os.path.exists(fixed_path):
                    image_path = fixed_path
                elif os.path.exists(original_image_path):
                    image_path = original_image_path
                else:
                    # Last-resort alternative locations to probe.
                    alternatives = [
                        original_image_path.lstrip('./'),
                        f"../{original_image_path.lstrip('./')}",
                        original_image_path.replace("./src/", "../")
                    ]
                    image_path = None
                    for alt in alternatives:
                        if os.path.exists(alt):
                            image_path = alt
                            break

                    if not image_path:
                        # Keep the original path so the failure is visible downstream.
                        image_path = original_image_path

                if VERBOSE_LOGGING:
                    print(f"πΌοΈ PRIMARY Image path resolution:")
                    print(f" Original: {original_image_path}")
                    print(f" Fixed: {image_path}")
                    print(f" Exists: {os.path.exists(image_path)}")

                image_info = {
                    "path": image_path,
                    "original_path": original_image_path,
                    "title": metadata.get("title", "Gambar"),
                    "caption": metadata.get("caption", result['content'][:100] + "..."),
                    "page": metadata.get("page", "N/A"),
                    "year": metadata.get("year", "N/A"),
                    "description": result['content'][:200] + "..." if len(result['content']) > 200 else result['content'],
                    "score": result.get("score", 0.0),
                    "is_primary": True  # everything reaching here is primary
                }
                image_data.append(image_info)
                if VERBOSE_LOGGING:
                    print(f"πΌοΈ Added PRIMARY image: {image_path}")

        # Extract table information (primary results only).
        elif content_type == "table":
            table_path = metadata.get("table_path", "")
            if table_path and os.path.exists(table_path):
                try:
                    table_info = {
                        "path": table_path,
                        "title": metadata.get("title", "Tabel"),
                        "page": metadata.get("page", "N/A"),
                        "year": metadata.get("year", "N/A"),
                        "rows": metadata.get("rows", 0),
                        "cols": metadata.get("cols", 0),
                        "description": result['content'][:200] + "..." if len(result['content']) > 200 else result['content'],
                        "score": result.get("score", 0.0),
                        "is_primary": True  # everything reaching here is primary
                    }

                    # Load actual table data from disk (CSV -> DataFrame, JSON -> dict).
                    if table_path.endswith('.csv'):
                        df = pd.read_csv(table_path)
                        table_info["data"] = df
                        table_info["data_type"] = "dataframe"
                    elif table_path.endswith('.json'):
                        with open(table_path, 'r', encoding='utf-8') as f:
                            json_data = json.load(f)
                        table_info["data"] = json_data
                        table_info["data_type"] = "json"

                    table_data.append(table_info)
                    if VERBOSE_LOGGING:
                        print(f"π Found PRIMARY table: {table_path}")

                except Exception as e:
                    print(f"β Error loading table {table_path}: {e}")
    primary_results = [r for r in results if r.get("is_primary_result", True)]
    contextual_results = [r for r in results if not r.get("is_primary_result", True)]
    response_data = {
        "question": question,
        "answer": response.strip(),
        "context": context,
        "sources": results,
        "primary_sources": primary_results,
        "contextual_sources": contextual_results,
        "years_searched": query_context["years"],
        "content_types_used": query_context["preferred_content_types"],
        "total_sources": len(results),
        "primary_sources_count": len(primary_results),
        "contextual_sources_count": len(contextual_results),
        "has_images": len(image_data) > 0,
        "has_tables": len(table_data) > 0,
        "image_data": image_data,  # full image metadata (path, title, etc.)
        "table_data": table_data,  # loaded table data (DataFrame/JSON)
        "image_paths": [img["path"] for img in image_data],
        "table_paths": [tbl["path"] for tbl in table_data],
        "year_comparison_mode": query_context["year_comparison_mode"],
        "context_expansion_enabled": self.CONTEXT_EXPANSION_ENABLED,
        "processing_time": datetime.now().isoformat()
    }

    if VERBOSE_LOGGING:
        print(f"β Query processed successfully")
        print(f"π― Primary sources: {len(primary_results)}")
        print(f"π Contextual sources: {len(contextual_results)}")
        print(f"πΌοΈ Images found: {len(image_data)}")
        print(f"π Tables found: {len(table_data)}")
    return response_data
|
| 750 |
+
|
| 751 |
+
def get_context_chain(self, result_id: str, max_depth: int = 3) -> List[Dict]:
    """Follow metadata-similarity links from one chunk to related chunks.

    Starting at ``result_id``, repeatedly asks the vectorizer for chunks with
    similar metadata and hops to the best match, up to ``max_depth`` hops.

    Returns:
        The list of visited chunks (possibly empty); [] on any failure.
    """
    try:
        links: List[Dict] = []
        cursor = result_id

        while len(links) < max_depth:
            candidates = self.vectorizer.find_similar_by_metadata(cursor)
            if not candidates:
                break

            # Hop to the most similar chunk and continue from there.
            top = candidates[0]
            links.append(top)
            cursor = top["metadata"]["id"]

        return links

    except Exception as e:
        print(f"β Error building context chain: {e}")
        return []
|
| 775 |
+
|
| 776 |
+
def get_full_document_context(self, metadata: Dict, year: int) -> str:
    """Get comprehensive context from the entire document/source.

    Builds a pseudo-query from the chunk's identifying metadata (program,
    year, chapter), retrieves many candidate chunks, keeps only those whose
    metadata is similar enough to the original chunk, and concatenates the
    first 10 (sorted by page) into one annotated context string.

    Args:
        metadata: metadata dict of the anchor chunk.
        year: year passed to the vectorizer's year filter.

    Returns:
        Combined context string, or "" on any failure.
    """
    try:
        # Build document identifier from available metadata fields.
        doc_identifiers = []

        if metadata.get('program'):
            doc_identifiers.append(metadata['program'])
        if metadata.get('year'):
            doc_identifiers.append(str(metadata['year']))
        if metadata.get('chapter'):
            doc_identifiers.append(metadata['chapter'])

        # Search for all chunks from the same document.
        doc_query = " ".join(doc_identifiers)

        # Get broader context: many chunks, no content-type restriction.
        doc_chunks = self.vectorizer.query_multimodal(
            query_text=doc_query,
            year=year,
            content_types=None,
            n_results=50  # get many chunks from the same document
        )

        # Keep only chunks whose metadata is close to the anchor chunk's.
        same_doc_chunks = []
        for chunk in doc_chunks:
            chunk_meta = chunk["metadata"]
            similarity_score = self.get_metadata_similarity_score(metadata, chunk_meta)
            if similarity_score > 0.5:  # adjust threshold as needed
                same_doc_chunks.append(chunk)

        # Sort by page number (missing page sorts last), then score.
        same_doc_chunks.sort(key=lambda x: (
            x["metadata"].get("page", 999),
            x.get("score", 0)
        ))

        # Combine content with clear separators; cap at 10 chunks and 500
        # chars each to avoid token overflow downstream.
        full_context = ""
        for i, chunk in enumerate(same_doc_chunks[:10]):
            page = chunk["metadata"].get("page", "N/A")
            content_type = chunk["metadata"].get("content_type", "unknown")
            full_context += f"\n--- {content_type.upper()} (Page {page}) ---\n"
            full_context += chunk["content"][:500] + "...\n"

        return full_context

    except Exception as e:
        print(f"β Error getting full document context: {e}")
        return ""
|
| 827 |
+
|
| 828 |
+
def advanced_context_retrieval(self, query_context: Dict[str, Any], k: int = 10) -> List[Dict]:
    """Advanced retrieval that considers document structure and relationships.

    Three stages:
      1. retrieve ``k//2`` high-quality primary results,
      2. attach a synthetic ``document_context`` result (full-document text)
         after each primary hit, scored at 80% of its parent,
      3. top up remaining slots with diverse supplementary results.

    Returns:
        At most ``k`` results, each tagged with a ``context_level`` of
        "primary", "document", or "supplementary".
    """

    # Step 1: Get initial high-quality results
    initial_results = self.retrieve_multimodal_context_enhanced(query_context, k//2)

    # Step 2: For each high-quality result, get its document context
    enhanced_results = []
    seen_ids = set()  # de-dup by chunk id across all stages

    for result in initial_results:
        result_id = result["metadata"].get("id", "")
        if result_id in seen_ids:
            continue

        seen_ids.add(result_id)
        result["context_level"] = "primary"
        enhanced_results.append(result)

        # Get document-level context for this hit's source document.
        year = result.get("search_year", result["metadata"].get("year"))
        if year:
            doc_context = self.get_full_document_context(result["metadata"], year)
            if doc_context:
                # Create a synthetic result carrying the full document context.
                doc_result = {
                    "content": doc_context,
                    "metadata": {
                        **result["metadata"],
                        "content_type": "document_context",
                        "id": f"{result_id}_doc_context"
                    },
                    "score": result.get("score", 0) * 0.8,  # slightly lower score than parent
                    "context_level": "document",
                    "parent_id": result_id
                }
                enhanced_results.append(doc_result)

    # Step 3: Fill remaining slots with diverse content
    remaining_k = k - len(enhanced_results)
    if remaining_k > 0:
        # NOTE(review): falls back to hard-coded year 2024 when no year was
        # parsed — confirm this should not use DEFAULT_SEARCH_YEARS instead.
        additional_results = self.vectorizer.query_multimodal(
            query_text=query_context["cleaned_query"],
            year=query_context["years"][0] if query_context["years"] else 2024,
            content_types=None,
            n_results=remaining_k * 2  # over-fetch to survive de-duplication
        )

        for add_result in additional_results:
            add_id = add_result["metadata"].get("id", "")
            if add_id not in seen_ids and len(enhanced_results) < k:
                add_result["context_level"] = "supplementary"
                enhanced_results.append(add_result)
                seen_ids.add(add_id)

    return enhanced_results[:k]
|
src/README.md
CHANGED
|
@@ -1,3 +1,5 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
python -m venv venv
|
| 2 |
+
source ./venv/bin/activate
|
| 3 |
+
pip install -r requirements.txt
|
| 4 |
+
streamlit run src/streamlit_app.py
|
| 5 |
+
Untuk mengubah konfigurasi aplikasi, edit `src/config.py`.
|
src/config.py
CHANGED
|
@@ -1,3 +1,72 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""Central configuration for the DTMI UGM academic RAG assistant."""

# --- Retrieval settings ---
COSINE_SIMILARITY_THRESHOLD = 0.80  # Threshold for text similarity
MAX_SIMILAR_CONTEXT = 5  # Maximum similar context to retrieve
VALID_YEARS = [2022, 2023, 2024]  # Valid years for filtering
DEFAULT_SEARCH_YEARS = [2022, 2023, 2024]  # Default if no year specified

# --- LLM settings ---
DEFAULT_LLM_MODEL = "gpt-3.5-turbo"
LLM_TEMPERATURE = 0.1  # Low temperature for more consistent responses
MAX_TOKENS = 2000
LLM_TIMEOUT = 30  # seconds
MAX_RETRIES = 3  # LLM call retries before falling back
RETRY_DELAY = 2  # seconds between retries

# Relative retrieval weight per content type (higher = more slots).
CONTENT_TYPE_STRATEGIES = {
    "silabus": 0.3,
    "curriculum": 0.25,
    "table": 0.2,
    "image": 0.1,
    "text_chunk": 0.4
}

# Rendering limits when converting tables to markdown.
TABLE_MARKDOWN_CONFIG = {
    "max_rows": 10,
    "max_cols": 8,
    "include_index": False,
    "float_format": ":.2f"
}

# --- Logging ---
VERBOSE_LOGGING = True
LOG_RETRIEVAL_DETAILS = True

# --- Chat settings ---
MAX_CHAT_HISTORY = 20  # Maximum chat history to keep
CONTEXT_WINDOW_SIZE = 3  # Number of previous exchanges to include in context

# Answer shown when generation fails entirely.
FALLBACK_RESPONSE = """Maaf, terjadi kesalahan dalam menghasilkan jawaban.
Silakan coba dengan pertanyaan yang lebih spesifik atau hubungi administrator sistem.
Contoh pertanyaan yang bisa dicoba:
- "Mata kuliah semester 1 teknik mesin 2022"
- "Kurikulum teknik industri tahun 2023"
- "Tabel distribusi mata kuliah"
"""

# Human-readable labels per content type (used in the sidebar checkboxes).
CONTENT_TYPE_DESCRIPTIONS = {
    "silabus": "π Silabus Mata Kuliah",
    "curriculum": "π Kurikulum Program Studi",
    "table": "π Tabel & Data",
    "image": "πΌοΈ Gambar & Diagram",
    "text_chunk": "π Teks Umum"
}

# Example queries shown in the UI, grouped by category.
# NOTE: the category "π Perbandingan & Analisis" previously appeared TWICE;
# in a dict literal the last key silently wins, so the first entry set was
# dropped. The two lists are merged here (also fixed typo "Aoa" -> "Apa").
EXAMPLE_QUERIES = {
    "π Data & Tabel": [
        "Tolong carikan format Cuti kuliah Tahun 2022",
        "Jadwal mata kuliah semester genap",
        "Tabel mata kuliah wajib dan pilihan",
        "Prasyarat mata kuliah desain produk 2022 "
    ],
    "π Kurikulum & Mata Kuliah": [
        "Mata kuliah semester 1 teknik mesin tahun 2024",
    ],
    "π Silabus & Detail Mata Kuliah": [
        "Silabus mata kuliah Termodinamika",
        "Detail pembelajaran Mekanika Fluida",
        "Prasyarat mata kuliah Perancangan Produk"
    ],
    "π Perbandingan & Analisis": [
        "Siapa Pengelola Layanan Akademik tahun 2022 ",
        "Perbedaan kurikulum teknik mesin dan industri",
        "Perubahan kurikulum dari 2022 ke 2024",
        "Apa isi kurikulum teknik mesin 2026",
    ],
}
|
src/streamlit_app.py
CHANGED
|
@@ -1,3 +1,521 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
from PIL import Image
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import json
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from typing import List, Dict, Any, Optional
|
| 8 |
+
from RAG import EnhancedMultimodalRAGSystem
|
| 9 |
+
from config import *
|
| 10 |
+
|
| 11 |
+
# Page config
|
| 12 |
+
st.set_page_config(
|
| 13 |
+
page_title="DTMI UGM Academic Assistant",
|
| 14 |
+
page_icon="π",
|
| 15 |
+
layout="wide",
|
| 16 |
+
initial_sidebar_state="expanded"
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
# Enhanced CSS - ChatGPT Style
|
| 20 |
+
st.markdown("""
|
| 21 |
+
<style>
|
| 22 |
+
/* Main Header */
|
| 23 |
+
.main-header {
|
| 24 |
+
background: linear-gradient(135deg, #1e3c72 0%, #2a5298 50%, #4a90e2 100%);
|
| 25 |
+
padding: 2rem;
|
| 26 |
+
border-radius: 15px;
|
| 27 |
+
color: white;
|
| 28 |
+
text-align: center;
|
| 29 |
+
margin-bottom: 2rem;
|
| 30 |
+
box-shadow: 0 8px 32px rgba(0,0,0,0.1);
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
.main-header h1 {
|
| 34 |
+
margin-bottom: 0.5rem;
|
| 35 |
+
font-size: 2.5rem;
|
| 36 |
+
font-weight: 700;
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
.main-header p {
|
| 40 |
+
margin: 0.3rem 0;
|
| 41 |
+
opacity: 0.9;
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
/* Chat Messages - Hitam Putih Simple */
|
| 45 |
+
.user-message {
|
| 46 |
+
background: #2d2d2d;
|
| 47 |
+
color: white;
|
| 48 |
+
padding: 1.2rem;
|
| 49 |
+
border-radius: 15px;
|
| 50 |
+
margin: 1rem 0;
|
| 51 |
+
border-left: 5px solid #0084ff;
|
| 52 |
+
box-shadow: 0 4px 12px rgba(0,0,0,0.2);
|
| 53 |
+
animation: slideInRight 0.3s ease-out;
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
.assistant-message {
|
| 57 |
+
background: #f8f9fa;
|
| 58 |
+
color: #2d2d2d;
|
| 59 |
+
padding: 1.2rem;
|
| 60 |
+
border-radius: 15px;
|
| 61 |
+
margin: 1rem 0;
|
| 62 |
+
border-left: 5px solid #28a745;
|
| 63 |
+
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
|
| 64 |
+
animation: slideInLeft 0.3s ease-out;
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
@keyframes slideInRight {
|
| 68 |
+
from { transform: translateX(20px); opacity: 0; }
|
| 69 |
+
to { transform: translateX(0); opacity: 1; }
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
@keyframes slideInLeft {
|
| 73 |
+
from { transform: translateX(-20px); opacity: 0; }
|
| 74 |
+
to { transform: translateX(0); opacity: 1; }
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
/* Example Queries */
|
| 78 |
+
.example-query {
|
| 79 |
+
background: #fff8e1;
|
| 80 |
+
color: #333;
|
| 81 |
+
padding: 1rem;
|
| 82 |
+
border-radius: 10px;
|
| 83 |
+
margin: 0.5rem 0;
|
| 84 |
+
border-left: 4px solid #ff9800;
|
| 85 |
+
cursor: pointer;
|
| 86 |
+
transition: all 0.3s ease;
|
| 87 |
+
box-shadow: 0 2px 8px rgba(255, 152, 0, 0.1);
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
.example-query:hover {
|
| 91 |
+
background: #ffecb3;
|
| 92 |
+
transform: translateY(-2px);
|
| 93 |
+
box-shadow: 0 4px 12px rgba(255, 152, 0, 0.2);
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
/* Source Preview */
|
| 97 |
+
.source-preview {
|
| 98 |
+
background: #f5f5f5;
|
| 99 |
+
color: #333;
|
| 100 |
+
padding: 1rem;
|
| 101 |
+
border-radius: 10px;
|
| 102 |
+
margin: 0.5rem 0;
|
| 103 |
+
font-size: 0.9em;
|
| 104 |
+
border-left: 3px solid #6c757d;
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
/* Buttons */
|
| 108 |
+
.stButton > button {
|
| 109 |
+
border-radius: 10px !important;
|
| 110 |
+
font-weight: 600 !important;
|
| 111 |
+
transition: all 0.3s ease !important;
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
.stButton > button:hover {
|
| 115 |
+
transform: translateY(-1px) !important;
|
| 116 |
+
box-shadow: 0 4px 12px rgba(0,0,0,0.15) !important;
|
| 117 |
+
}
|
| 118 |
+
.chat-container {
|
| 119 |
+
height: calc(100vh - 180px);
|
| 120 |
+
overflow-y: auto;
|
| 121 |
+
padding: 1rem;
|
| 122 |
+
border: 1px solid #e0e0e0;
|
| 123 |
+
border-radius: 10px;
|
| 124 |
+
background-color: #fafafa;
|
| 125 |
+
margin-bottom: 1rem;
|
| 126 |
+
}
|
| 127 |
+
.fixed-input {
|
| 128 |
+
position: fixed;
|
| 129 |
+
bottom: 2rem;
|
| 130 |
+
width: 60%;
|
| 131 |
+
max-width: 800px;
|
| 132 |
+
left: 50%;
|
| 133 |
+
transform: translateX(-50%);
|
| 134 |
+
background-color: white;
|
| 135 |
+
padding: 1rem;
|
| 136 |
+
border-radius: 10px;
|
| 137 |
+
box-shadow: 0 4px 16px rgba(0,0,0,0.1);
|
| 138 |
+
z-index: 999;
|
| 139 |
+
}
|
| 140 |
+
.spacer {
|
| 141 |
+
height: 120px; /* Tambahkan spacer agar konten tak tertutup input */
|
| 142 |
+
}
|
| 143 |
+
</style>
|
| 144 |
+
""", unsafe_allow_html=True)
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
@st.cache_resource
def initialize_rag_system():
    """Create (and cache) the RAG system singleton.

    ``st.cache_resource`` ensures the heavy system is constructed only once
    per server process. On failure the error is shown in the UI and the
    Streamlit script run is halted via ``st.stop()``.
    """
    try:
        return EnhancedMultimodalRAGSystem()
    except Exception as e:
        st.error(f"β Error initializing RAG system: {e}")
        st.stop()
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def display_example_queries():
    """Display clickable example queries grouped by category.

    Clicking a button stores the query in ``st.session_state.user_input``
    and reruns the app so the main chat handler picks it up.
    """
    st.markdown("### π‘ Contoh Pertanyaan")

    for category, queries in EXAMPLE_QUERIES.items():
        with st.expander(f"{category}", expanded=True):
            for i, query in enumerate(queries):
                # Key combines category and position: the previous
                # ``hash(query)`` key raised a duplicate-element-key error
                # whenever the same example string appeared in two categories.
                if st.button(f"π¬ {query}", key=f"example_{category}_{i}",
                             use_container_width=True):
                    st.session_state.user_input = query
                    st.rerun()
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def display_tables_in_chat(table_data: List[Dict]):
    """Display tables directly in chat.

    Each entry is rendered in its own expander with page/year/score metrics,
    the table itself (DataFrame or JSON), a matching download button, and
    the retrieval description.

    Args:
        table_data: list of table-info dicts as produced by the RAG system
            (``title``, ``page``, ``year``, ``score``, ``data``,
            ``data_type``, optional ``description``).
    """
    if not table_data:
        return

    st.markdown("### π Tabel Data")

    for i, table_info in enumerate(table_data, 1):
        with st.expander(f"π {table_info['title']} (Hal. {table_info['page']}, {table_info['year']})", expanded=True):

            # Table metadata
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("π Halaman", table_info['page'])
            with col2:
                st.metric("π Tahun", table_info['year'])
            with col3:
                st.metric("π Score", f"{table_info['score']:.3f}")
            # Display table data
            try:
                if table_info.get("data_type") == "dataframe" and isinstance(table_info["data"], pd.DataFrame):
                    st.dataframe(table_info["data"], use_container_width=True)
                    # Download CSV
                    csv_data = table_info["data"].to_csv(index=False)
                    st.download_button(
                        label="πΎ Download CSV",
                        data=csv_data,
                        file_name=f"table_{i}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                        mime="text/csv"
                    )

                elif table_info.get("data_type") == "json":
                    st.json(table_info["data"])

                    # Download JSON
                    json_str = json.dumps(table_info["data"], indent=2, ensure_ascii=False)
                    st.download_button(
                        label="πΎ Download JSON",
                        data=json_str,
                        file_name=f"data_{i}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                        mime="application/json"
                    )

                # Show description
                if table_info.get('description'):
                    st.markdown("**π Deskripsi:**")
                    st.text(table_info['description'])

            except Exception as e:
                st.error(f"β Error displaying table: {e}")
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def display_single_image_compact(img_info: Dict, index: int):
    """Display a single retrieved image in the compact (grid) layout.

    Shows the image with caption, relevance-score and pixel-size metrics,
    plus an expander with the retrieval description and caption.

    Args:
        img_info: image metadata dict (``path``, ``title``, ``page``,
            ``year``, ``score``, optional ``description``/``caption``).
        index: 1-based position, used in labels and error messages.
    """
    try:
        image_path = img_info["path"]

        # Check if file exists
        if not os.path.exists(image_path):
            st.error(f"β Gambar {index} tidak ditemukan")
            return

        # Load and display image
        image = Image.open(image_path)

        # Display image with nice styling
        st.image(image,
                 caption=f"π {img_info.get('title', 'Gambar')} - Hal. {img_info.get('page', 'N/A')} ({img_info.get('year', 'N/A')})",
                 use_container_width=True)

        # Compact metadata
        col1, col2 = st.columns(2)
        with col1:
            # Fixed: previously rendered the literal string "None" instead of
            # the actual retrieval score (real value was commented out).
            st.metric("π Relevance Score", f"{img_info.get('score', 0):.2f}")
        with col2:
            st.metric("π Ukuran", f"{image.width}Γ{image.height}px")

        # Expandable details
        with st.expander(f"π Detail Gambar {index}", expanded=False):
            if img_info.get('description'):
                st.markdown("**π Deskripsi:**")
                st.text(img_info['description'])
            if img_info.get('caption'):
                st.markdown("**π¬ Caption:**")
                st.text(img_info['caption'])

    except Exception as e:
        st.error(f"β Error loading image {index}: {str(e)}")
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def display_single_image_full(img_info: Dict):
    """Display a single retrieved image in the full (single-image) layout.

    Renders the image next to a metadata column (page, year, dimensions),
    offers a download button, and shows description/caption panels below.

    Args:
        img_info: image metadata dict (``path``, ``title``, ``page``,
            ``year``, optional ``description``/``caption``).
    """
    try:
        image_path = img_info["path"]

        if not os.path.exists(image_path):
            st.error("β Gambar tidak ditemukan")
            return

        # Load image
        image = Image.open(image_path)

        # Display with title
        st.markdown(f"### πΌοΈ {img_info.get('title', 'Gambar')}")

        # Create columns for image and metadata
        col1, col2 = st.columns([3, 1])

        with col1:
            # Fixed: ``use_column_width`` is deprecated; use
            # ``use_container_width`` for consistency with the compact view.
            st.image(image, use_container_width=True)

        with col2:
            st.markdown("**π Informasi Gambar**")
            st.metric("π Halaman", img_info.get('page', 'N/A'))
            st.metric("π Tahun", img_info.get('year', 'N/A'))
            # st.metric("π Score", f"{img_info.get('score', 0):.3f}")
            st.metric("π Dimensi", f"{image.width} Γ {image.height}")

            # Download button
            with open(image_path, "rb") as file:
                st.download_button(
                    label="πΎ Download Gambar",
                    data=file.read(),
                    file_name=os.path.basename(image_path),
                    mime="image/png",
                    use_container_width=True
                )

        # Description below image
        if img_info.get('description'):
            st.markdown("**π Deskripsi Gambar:**")
            st.info(img_info['description'])

        if img_info.get('caption'):
            st.markdown("**π¬ Caption:**")
            st.info(img_info['caption'])

    except Exception as e:
        st.error(f"β Error loading image: {str(e)}")
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
def display_images_in_chat(image_data: List[Dict], show_details: bool = True):
    """Render retrieved images inside the chat.

    A single image gets the full layout; multiple images are arranged in a
    two-column compact grid. Does nothing when the list is empty.
    """
    if not image_data:
        return

    count = len(image_data)
    st.markdown("### πΌοΈ Gambar Terkait")
    if count == 1:
        st.markdown(f"*Ditemukan 1 gambar relevan*")
    else:
        st.markdown(f"*Ditemukan {count} gambar relevan*")

    if count > 1:
        grid = st.columns(min(count, 2))  # max 2 columns
        for idx, info in enumerate(image_data):
            with grid[idx % 2]:
                display_single_image_compact(info, idx + 1)
    else:
        display_single_image_full(image_data[0])
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def enhanced_chat_interface():
    """Render the main chat UI.

    Layout: sidebar with retrieval/display settings, a main column with the
    chat history (including inline multimodal content), a pinned chat input,
    and — when compact mode is off — a side panel with example queries,
    quick actions, and chat export.
    """
    # --- Session-state bootstrap ---
    if 'messages' not in st.session_state:
        st.session_state.messages = []
    if 'user_input' not in st.session_state:
        st.session_state.user_input = ""
    rag_system = initialize_rag_system()

    st.markdown("""
    <div class="main-header">
        <h1>π DTMI UGM Academic Assistant</h1>
        <p>Asisten Cerdas Multimodal untuk Informasi Akademik DTMI UGM</p>
        <p>π¬ Tanyakan apapun tentang kurikulum, silabus, gambar, dan tabel data</p>
    </div>
    """, unsafe_allow_html=True)

    # --- Sidebar with controls ---
    with st.sidebar:
        st.markdown("### βοΈ Pengaturan")

        # Content type preferences: one checkbox per known content type.
        st.markdown("### π― Preferensi Konten")
        content_preferences = []
        for content_type, description in CONTENT_TYPE_DESCRIPTIONS.items():
            if st.checkbox(description, key=f"pref_{content_type}"):
                content_preferences.append(content_type)

        # Retrieval settings
        st.markdown("### π Pengaturan Pencarian")
        max_results = st.slider("Jumlah Konteks Maksimal", 5, 20, 10)

        # Display settings
        st.markdown("### π Tampilan")
        show_images_inline = st.checkbox("πΌοΈ Tampilkan Gambar", value=True)
        show_tables_inline = st.checkbox("π Tampilkan Tabel", value=True)
        compact_mode = st.checkbox("π± Mode Kompak", value=False)

        # Chat statistics (two messages per conversation turn)
        if st.session_state.messages:
            st.markdown("### π Statistik")
            total_messages = len(st.session_state.messages)
            st.metric("π¬ Total Pesan", total_messages)
            st.metric("π£οΈ Percakapan", total_messages // 2)

        # Clear chat
        if st.button("ποΈ Hapus Chat", type="secondary", use_container_width=True):
            st.session_state.messages = []
            st.rerun()

    # --- Main chat area ---
    # BUGFIX: the original used st.columns([3, 1] if not compact_mode else
    # [1, 0]); Streamlit rejects zero column weights, so compact mode raised
    # a StreamlitAPIException. Use a plain container in compact mode instead
    # and skip the side panel entirely.
    if compact_mode:
        col1, col2 = st.container(), None
    else:
        col1, col2 = st.columns([3, 1])

    with col1:
        # Display chat history
        for message in st.session_state.messages:
            if message["role"] == "user":
                st.markdown(f"""
                <div class="user-message">
                    <strong>π€ Anda:</strong><br>
                    {message["content"]}
                </div>
                """, unsafe_allow_html=True)
            else:
                st.markdown(f"""
                <div class="assistant-message">
                    <strong>π€ Assistant:</strong><br>
                    {message["content"]}
                </div>
                """, unsafe_allow_html=True)

                # Multimodal content attached to this assistant answer.
                if "result_data" in message:
                    result_data = message["result_data"]

                    # Quick stats when the answer carried images/tables.
                    if result_data.get("has_images") or result_data.get("has_tables"):
                        st.markdown("---")
                        col_stats1, col_stats2, col_stats3 = st.columns(3)
                        with col_stats1:
                            st.metric("πΌοΈ Gambar", len(result_data.get("image_data", [])))
                        with col_stats2:
                            st.metric("π Tabel", len(result_data.get("table_data", [])))
                        with col_stats3:
                            st.metric("π Sumber", result_data.get("total_sources", 0))

                    # Inline images / tables, gated by the sidebar toggles.
                    if show_images_inline and result_data.get("has_images"):
                        display_images_in_chat(result_data.get("image_data", []))

                    if show_tables_inline and result_data.get("has_tables"):
                        display_tables_in_chat(result_data.get("table_data", []))

                # Collapsible sources (top 3 only).
                if "sources" in message and message["sources"]:
                    with st.expander("π Lihat Sumber Informasi", expanded=False):
                        for i, source in enumerate(message["sources"][:3], 1):
                            content_type = source['metadata']['content_type']
                            year = source['metadata'].get('year', 'N/A')
                            page = source['metadata'].get('page', 'N/A')

                            st.markdown(f"""
                            **π Sumber {i}:** {CONTENT_TYPE_DESCRIPTIONS.get(content_type, content_type)}
                            **π Tahun:** {year} | **π Halaman:** {page}
                            **π Preview:** {source['content'][:150]}...
                            """)
                            st.markdown("---")

    # Chat input (Streamlit pins this widget to the bottom of the page).
    user_input = st.chat_input(
        "π¬ Tanyakan tentang kurikulum, gambar, tabel, atau informasi lainnya...", key="chat_input")

    # Handle example-query / quick-action selection: a pending value in
    # session state overrides the (empty) chat input for this rerun.
    if st.session_state.user_input:
        user_input = st.session_state.user_input
        st.session_state.user_input = ""

    # --- Process user input ---
    if user_input:
        # Add user message
        st.session_state.messages.append({"role": "user", "content": user_input})

        # Show loading while the RAG pipeline runs.
        with st.spinner("π Mencari informasi relevan..."):
            try:
                result_data = rag_system.query(
                    user_input,
                    k=max_results,
                    content_filter=content_preferences if content_preferences else None
                )

                # Save assistant message with the complete retrieval payload
                # so history reruns can re-render images/tables/sources.
                assistant_message = {
                    "role": "assistant",
                    "content": result_data["answer"],
                    "sources": result_data["sources"],
                    "result_data": result_data
                }

                st.session_state.messages.append(assistant_message)

            except Exception as e:
                st.error(f"β Terjadi kesalahan: {e}")
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": "Maaf, terjadi kesalahan dalam memproses pertanyaan Anda. Silakan coba lagi."
                })

        st.rerun()

    # --- Side panel with example queries (only if not compact) ---
    if not compact_mode:
        with col2:
            display_example_queries()

            # Quick actions: clicking stores the canned query and reruns;
            # the pending-input branch above then submits it.
            st.markdown("### β‘ Aksi Cepat")

            quick_actions = [
                ("πΌοΈ Cari Gambar", "Tampilkan gambar formulir atau diagram"),
                ("π Lihat Tabel", "Tabel kurikulum semester 1"),
                ("π Info Program", "Informasi program studi teknik mesin"),
                ("π Silabus", "Silabus mata kuliah wajib")
            ]

            for label, query in quick_actions:
                if st.button(label, use_container_width=True):
                    st.session_state.user_input = query
                    st.rerun()

            # Export the transcript as Markdown.
            if st.session_state.messages:
                st.markdown("### π€ Export")
                if st.button("πΎ Download Chat", use_container_width=True):
                    chat_export = ""
                    for msg in st.session_state.messages:
                        role = "User" if msg["role"] == "user" else "Assistant"
                        chat_export += f"**{role}:** {msg['content']}\n\n"

                    st.download_button(
                        label="π Download Markdown",
                        data=chat_export,
                        file_name=f"chat_dtmi_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md",
                        mime="text/markdown",
                        use_container_width=True
                    )
| 514 |
+
|
| 515 |
+
def main():
    """Application entry point: renders the full chat interface."""
    enhanced_chat_interface()


if __name__ == "__main__":
    main()
|
src/year_parser.py
CHANGED
|
@@ -1,3 +1,60 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import re
|
| 3 |
+
from typing import List, Tuple
|
| 4 |
+
from config import *
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class YearParser:
    """Extract academic years from free-text (Indonesian) queries.

    Recognizes single years ("2023") and range phrasings such as
    "2022-2024", "2022 sampai 2024", "2022 hingga 2024", "2022 s.d 2024".
    Only years listed in ``VALID_YEARS`` are kept.
    """

    # The only academic years covered by the document corpus.
    VALID_YEARS = [2022, 2023, 2024]

    @staticmethod
    def extract_years(query: str) -> Tuple[List[int], str, bool, bool]:
        """Parse years out of *query*.

        Returns a 4-tuple:
            - sorted, de-duplicated list of valid years found (defaults to
              every valid year when the query mentions no year at all),
            - the query with all year tokens removed and whitespace tidied,
            - whether the user mentioned any year,
            - whether any mentioned year falls outside ``VALID_YEARS``.
        """
        found: List[int] = []
        remainder = query
        mentioned = False
        mentioned_invalid = False

        year_re = r'\b(20\d{2})\b'
        range_res = [
            r'\b(20\d{2})\s*-\s*(20\d{2})\b',          # 2022-2024
            r'\b(20\d{2})\s+sampai\s+(20\d{2})\b',     # 2022 sampai 2024
            r'\b(20\d{2})\s+hingga\s+(20\d{2})\b',     # 2022 hingga 2024
            r'\b(20\d{2})\s+s\.?d\.?\s+(20\d{2})\b',   # 2022 s.d 2024
        ]

        # Ranges take precedence: when any range matches, stand-alone year
        # tokens are not parsed separately (they are part of the range).
        saw_range = False
        for pattern in range_res:
            for lo, hi in re.findall(pattern, query, re.IGNORECASE):
                mentioned = True
                saw_range = True
                for candidate in range(int(lo), int(hi) + 1):
                    if candidate in YearParser.VALID_YEARS:
                        found.append(candidate)
                    else:
                        mentioned_invalid = True
            remainder = re.sub(pattern, '', remainder, flags=re.IGNORECASE)

        if not saw_range:
            singles = re.findall(year_re, query)
            if singles:
                mentioned = True
                for token in singles:
                    value = int(token)
                    if value in YearParser.VALID_YEARS:
                        found.append(value)
                    else:
                        mentioned_invalid = True
                remainder = re.sub(year_re, '', remainder)

        # No fallback to all valid years when the user *did* mention years
        # but every one of them was invalid — return an empty list instead.
        if not found and not mentioned:
            found = YearParser.VALID_YEARS.copy()

        # Tidy up whatever the removals left behind.
        remainder = re.sub(r'\s+', ' ', remainder).strip()
        remainder = re.sub(r'^[,\-\s]+|[,\-\s]+$', '', remainder)

        return sorted(set(found)), remainder, mentioned, mentioned_invalid
|