Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| # -*- coding: utf-8 -*- | |
| import sys | |
| import os | |
| sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) | |
| from tqdm import tqdm | |
| from langchain.vectorstores import Chroma | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain.schema import Document | |
| from coachable_course_agent.load_data import load_courses | |
| def clean_provider_name(provider): | |
| """ | |
| Clean provider name by removing duplicate first letters. | |
| If the first two letters are the same and both uppercase, drop the first one. | |
| E.g., "IIllinois Tech" -> "Illinois Tech", "IIBM" -> "IBM" | |
| """ | |
| if not provider or len(provider) < 2: | |
| return provider | |
| # Check if first two letters are the same and both uppercase | |
| if provider[0] == provider[1] and provider[0].isupper() and provider[1].isupper(): | |
| return provider[1:] # Drop the first letter | |
| return provider | |
| # 1. Load course catalog (including ESCO-linked skills) | |
| course_data = load_courses("data/course_catalog_esco.json") | |
| # Handle both old format (direct array) and new format (with metadata) | |
| if isinstance(course_data, dict) and 'courses' in course_data: | |
| courses = course_data['courses'] | |
| print(f"π Loaded {len(courses)} courses from catalog with metadata") | |
| elif isinstance(course_data, list): | |
| courses = course_data | |
| print(f"π Loaded {len(courses)} courses from legacy format") | |
| else: | |
| print("β Unexpected course data format") | |
| sys.exit(1) | |
| # 2. Initialize embedding model | |
| embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2", show_progress=True) | |
| # 3. Convert to LangChain documents | |
| course_docs = [] | |
| for course in tqdm(courses, desc="Processing courses"): | |
| skill_names = ", ".join([s["name"] for s in course.get("skills", [])]) | |
| content = f"{course['title']}. Skills: {skill_names}" | |
| # Clean the provider name during loading | |
| cleaned_provider = clean_provider_name(course["provider"]) | |
| metadata = { | |
| "id": course["id"], | |
| "title": course["title"], | |
| "provider": cleaned_provider, # Use cleaned provider name | |
| "source_platform": course.get("source_platform", ""), | |
| "duration_hours": course.get("duration_hours", 0), | |
| "level": course.get("level", "Unknown"), | |
| "format": course.get("format", "Unknown"), | |
| "url": course.get("url", ""), | |
| "skills": skill_names | |
| } | |
| course_docs.append(Document(page_content=content, metadata=metadata)) | |
| # 4. Save to persistent ChromaDB | |
| persist_dir = "data/courses_chroma" | |
| vectorstore = Chroma.from_documents( | |
| documents=course_docs, | |
| embedding=embedding_model, | |
| persist_directory=persist_dir | |
| ) | |
| vectorstore.persist() | |
| print(f"β Stored {len(course_docs)} courses to ChromaDB at '{persist_dir}'") | |