import os
import json
from typing import List, Dict, Any, Optional
from dotenv import load_dotenv
# LangChain imports
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document
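# Note: these are the legacy `langchain` import paths. Newer releases move the
# same classes to `langchain_community.vectorstores` / `langchain_openai`, so
# adjust the imports to match the installed version.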
load_dotenv()
class LangChainMultimodalVectorizer:
    def __init__(self):
        # OpenAIEmbeddings reads OPENAI_API_KEY from the environment by default;
        # pass openai_api_key explicitly here only if you need to override it.
        self.embeddings = OpenAIEmbeddings(
            model=os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002")
        )
        self.persist_dir = os.getenv("CHROMA_PERSIST_DIR", "./chroma_persist")
    def get_or_create_vectorstore(self, year: int) -> Chroma:
        """Get or create Chroma vectorstore for specific year"""
        collection_name = f"optima_multimodal_{year}"
        # Create persist directory for this year
        year_persist_dir = os.path.join(self.persist_dir, f"year_{year}")
        os.makedirs(year_persist_dir, exist_ok=True)
        try:
            # Try to load existing vectorstore
            vectorstore = Chroma(
                collection_name=collection_name,
                embedding_function=self.embeddings,
                persist_directory=year_persist_dir
            )
            # Check if collection exists and has documents
            if vectorstore._collection.count() > 0:
                print(f"Using existing vectorstore: {collection_name} ({vectorstore._collection.count()} docs)")
            else:
                print(f"Created new vectorstore: {collection_name}")
        except Exception:
            print(f"Creating new vectorstore: {collection_name}")
            vectorstore = Chroma(
                collection_name=collection_name,
                embedding_function=self.embeddings,
                persist_directory=year_persist_dir
            )
        return vectorstore
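    # With the default CHROMA_PERSIST_DIR this yields one directory per year,
    # e.g. ./chroma_persist/year_2022 holding collection "optima_multimodal_2022".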
    def create_embedding_text(self, item: Dict[str, Any]) -> str:
        """Create optimized text for embedding based on content_type"""
        content_type = item.get("content_type", "")
        content = item.get("content", "")
        context_text = item.get("context_text", "")
        # Create rich embedding text based on content_type
        if content_type == "silabus":
            mata_kuliah = item.get("mata_kuliah", "")
            course_code = item.get("course_code", "")
            silabus_type = item.get("silabus_type", "")
            program = item.get("program", "")
            semester = item.get("semester", "")
            embedding_text = f"Silabus {program} semester {semester} {mata_kuliah} {course_code} {silabus_type}: {content} {context_text}"
        elif content_type == "curriculum":
            program = item.get("program", "")
            semester = item.get("semester", "")
            table_type = item.get("table_type", "")
            embedding_text = f"Kurikulum {program} semester {semester} {table_type}: {content} {context_text}"
        elif content_type == "image":
            title = item.get("title", "")
            caption = item.get("caption", "")
            embedding_text = f"Gambar: {title} {caption} {content} {context_text}"
        elif content_type == "table":
            title = item.get("title", "")
            caption = item.get("caption", "")
            rows = item.get("rows", 0)
            cols = item.get("cols", 0)
            embedding_text = f"Tabel {rows}x{cols}: {title} {caption} {content} {context_text}"
        else:  # text_chunk
            chapter = item.get("chapter", "")
            section = item.get("section", "")
            embedding_text = f"Teks {chapter} {section}: {content} {context_text}"
        return embedding_text
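    # Example: a (hypothetical) curriculum item such as
    #   {"content_type": "curriculum", "program": "Informatika", "semester": "3",
    #    "table_type": "mata_kuliah_wajib", "content": "...", "context_text": "..."}
    # becomes the embedding text
    #   "Kurikulum Informatika semester 3 mata_kuliah_wajib: ... ..."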
    def prepare_document_metadata(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Prepare metadata for LangChain Document"""
        content_type = item.get("content_type", "")
        # Base metadata (common for all types)
        metadata = {
            "id": item.get("id", ""),
            "content_type": content_type,
            "year": item.get("year", 0),
            "page": item.get("page", 0),
            "filename": item.get("filename", "")[:200],
            "filepath": item.get("filepath", "")[:300],
            "extracted_at": item.get("extracted_at", "")
        }
        # Add specific metadata based on content_type
        if content_type == "silabus":
            metadata.update({
                "mata_kuliah": item.get("mata_kuliah", "")[:200],
                "course_code": item.get("course_code", ""),
                "sks": item.get("sks", ""),
                "program": item.get("program", ""),
                "semester": item.get("semester", ""),
                "silabus_type": item.get("silabus_type", "")
            })
        elif content_type == "curriculum":
            metadata.update({
                "program": item.get("program", ""),
                "semester": item.get("semester", ""),
                "table_type": item.get("table_type", ""),
                "content_type_detail": item.get("content_type_detail", ""),
                "rows_count": item.get("rows_count", 0)
            })
        elif content_type == "image":
            metadata.update({
                "title": item.get("title", "")[:200],
                "caption": item.get("caption", "")[:300],
                "image_index": item.get("image_index", 0),
                "image_path": item.get("filepath", "")
            })
        elif content_type == "table":
            metadata.update({
                "title": item.get("title", "")[:200],
                "caption": item.get("caption", "")[:300],
                "table_index": item.get("table_index", 0),
                "rows": item.get("rows", 0),
                "cols": item.get("cols", 0),
                "table_path": item.get("filepath", "")
            })
        else:  # text_chunk
            metadata.update({
                "chapter": item.get("chapter", "")[:200],
                "section": item.get("section", "")[:200],
                "subsection": item.get("subsection", "")[:200],
                "chunk_type": item.get("chunk_type", ""),
                "quality_score": item.get("quality_score", 0.0)
            })
        return metadata
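    # Note: long fields are truncated ([:200] / [:300]) above to keep each
    # record compact; Chroma stores this metadata verbatim next to every
    # embedding, and it is what the query-time "filter" argument matches against.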
    def process_unified_json(self, json_file_path: str, year: int) -> Dict[str, int]:
        """Process unified multimodal JSON file using LangChain"""
        if not os.path.exists(json_file_path):
            print(f"File not found: {json_file_path}")
            return {}
        print(f"Processing: {json_file_path}")
        with open(json_file_path, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)
        # Handle different JSON structures
        if isinstance(raw_data, dict):
            if 'content' in raw_data:
                data = raw_data['content']  # Extract from content array
                print("Detected structured JSON with 'content' key")
            else:
                print(f"Unexpected JSON structure: {list(raw_data.keys())}")
                return {}
        elif isinstance(raw_data, list):
            data = raw_data  # Direct array
            print("Detected direct array JSON")
        else:
            print(f"Unexpected JSON type: {type(raw_data)}")
            return {}
        # Get vectorstore for this year
        vectorstore = self.get_or_create_vectorstore(year)
        # Statistics
        stats = {
            "text_chunk": 0,
            "image": 0,
            "table": 0,
            "curriculum": 0,
            "silabus": 0,
            "total": 0,
            "errors": 0,
            "skipped": 0
        }
        print(f"Found {len(data)} items for year {year}")
        # Prepare documents for batch processing
        documents = []
        batch_size = 50
        for idx, item in enumerate(data):
            try:
                # Ensure item is a dict
                if not isinstance(item, dict):
                    print(f"Skipping non-dict item at index {idx}: {type(item)}")
                    stats["skipped"] += 1
                    continue
                content_type = item.get("content_type", "unknown")
                content = item.get("content", "")
                context_text = item.get("context_text", "")
                # Skip if no meaningful content
                if not content and not context_text:
                    stats["skipped"] += 1
                    continue
                if len(str(content).strip()) < 3 and len(str(context_text).strip()) < 10:
                    stats["skipped"] += 1
                    continue
                # Create embedding text
                embedding_text = self.create_embedding_text(item)
                # Prepare metadata
                metadata = self.prepare_document_metadata(item)
                # Create LangChain Document
                doc = Document(
                    page_content=embedding_text,
                    metadata=metadata
                )
                documents.append(doc)
                # Update stats
                if content_type in stats:
                    stats[content_type] += 1
                else:
                    stats["unknown"] = stats.get("unknown", 0) + 1
                stats["total"] += 1
                # Process batch when full
                if len(documents) >= batch_size:
                    self.add_documents_to_vectorstore(vectorstore, documents)
                    print(f"  Processed batch {stats['total'] // batch_size} ({stats['total']} items)")
                    documents = []  # Reset batch
            except Exception as e:
                print(f"Error processing item {idx}: {e}")
                print(f"  Item type: {type(item)}")
                if isinstance(item, dict):
                    print(f"  Item keys: {list(item.keys())[:5]}...")
                else:
                    print(f"  Item content preview: {str(item)[:100]}...")
                stats["errors"] += 1
        # Process remaining documents
        if documents:
            self.add_documents_to_vectorstore(vectorstore, documents)
        # Persist the vectorstore
        vectorstore.persist()
        print(f"Processing complete for year {year}:")
        for key, value in stats.items():
            if value > 0:
                print(f"  {key}: {value}")
        return stats
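    # The unified JSON is accepted in either of two shapes (field values below
    # are illustrative, not taken from a real file):
    #   {"content": [{"content_type": "table", "content": "...", ...}, ...]}
    # or a bare array:
    #   [{"content_type": "text_chunk", "content": "...", ...}, ...]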
    def add_documents_to_vectorstore(self, vectorstore: Chroma, documents: List[Document]):
        """Add documents to vectorstore"""
        try:
            vectorstore.add_documents(documents)
        except Exception as e:
            print(f"Error adding documents to vectorstore: {e}")
    def query_multimodal(self, query_text: str, year: Optional[int] = None,
                         content_types: Optional[List[str]] = None,
                         n_results: int = 10) -> List[Dict]:
        """Similarity search across one year (or all years), optionally filtered by content_type"""
        results = []
        years_to_search = [year] if year else [2022, 2023, 2024]
        for search_year in years_to_search:
            try:
                vectorstore = self.get_or_create_vectorstore(search_year)
                # Build metadata filter for content types, if requested
                search_filter = None
                if content_types:
                    search_filter = {"content_type": {"$in": content_types}}
                # Perform similarity search (returns (Document, distance) pairs)
                docs = vectorstore.similarity_search_with_score(
                    query_text,
                    k=n_results,
                    filter=search_filter
                )
                # Format results
                for doc, score in docs:
                    result = {
                        "content": doc.page_content,
                        "metadata": doc.metadata,
                        "score": score,
                        "year": search_year
                    }
                    # Add special handling for images
                    if result["metadata"]["content_type"] == "image":
                        result["image_path"] = result["metadata"].get("image_path", "")
                        result["retrievable"] = os.path.exists(result["image_path"]) if result["image_path"] else False
                    # Add special handling for tables
                    elif result["metadata"]["content_type"] == "table":
                        result["table_path"] = result["metadata"].get("table_path", "")
                        result["retrievable"] = os.path.exists(result["table_path"]) if result["table_path"] else False
                    results.append(result)
            except Exception as e:
                print(f"Error querying year {search_year}: {e}")
        # Sort by score (lower is better for distance-based scoring)
        results.sort(key=lambda x: x["score"])
        return results[:n_results]
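    # Example call (hypothetical query and filters):
    #   vectorizer.query_multimodal("struktur kurikulum informatika",
    #                               year=2024,
    #                               content_types=["curriculum", "table"],
    #                               n_results=5)
    # returns up to 5 dicts sorted by ascending distance, each carrying
    # "content", "metadata", "score", "year", and, for images/tables, a
    # "retrievable" flag indicating whether the source file still exists on disk.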
    def get_vectorstore_stats(self, year: int) -> Dict:
        """Get statistics for a vectorstore"""
        try:
            vectorstore = self.get_or_create_vectorstore(year)
            count = vectorstore._collection.count()
            return {
                "year": year,
                "total_documents": count,
                "collection_name": f"optima_multimodal_{year}"
            }
        except Exception as e:
            print(f"Error getting stats for year {year}: {e}")
            return {"year": year, "total_documents": 0, "error": str(e)}
def process_all_unified_files(data_dir: str = "./chunked"):
    """Vectorize the unified multimodal JSON file for every supported year"""
    vectorizer = LangChainMultimodalVectorizer()
    years = [2022, 2023, 2024]
    total_stats = {"total": 0, "errors": 0}
    for year in years:
        json_file = os.path.join(data_dir, f"multimodal_unified_{year}.json")
        if not os.path.exists(json_file):
            print(f"File not found: {json_file}")
            continue
        print(f"\nProcessing year {year}...")
        stats = vectorizer.process_unified_json(json_file, year)
        if stats:
            print(f"Year {year} Final Statistics:")
            for content_type, count in stats.items():
                print(f"  {content_type}: {count}")
            total_stats["total"] += stats.get("total", 0)
            total_stats["errors"] += stats.get("errors", 0)
    print("\nFINAL PROCESSING SUMMARY:")
    print(f"  Total documents processed: {total_stats['total']}")
    print(f"  Total errors: {total_stats['errors']}")
    # Show vectorstore stats
    print("\nVECTORSTORE STATISTICS:")
    for year in years:
        stats = vectorizer.get_vectorstore_stats(year)
        print(f"  {year}: {stats['total_documents']} documents")

if __name__ == "__main__":
    process_all_unified_files()
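# Minimal sketch of an ad-hoc query session after ingestion (the query string
# is hypothetical; names match the class above):
#
#   vectorizer = LangChainMultimodalVectorizer()
#   hits = vectorizer.query_multimodal("silabus basis data", year=2023)
#   for hit in hits[:3]:
#       print(hit["score"], hit["metadata"]["content_type"], hit["content"][:80])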