rosvend committed on
Commit
c03c816
·
1 Parent(s): 045cdca

feat: added chunking using MMR and BM25

Browse files
pyproject.toml CHANGED
@@ -15,4 +15,5 @@ dependencies = [
15
  "playwright>=1.55.0",
16
  "python-dotenv>=1.0.0",
17
  "faiss-cpu>=1.9.0",
 
18
  ]
 
15
  "playwright>=1.55.0",
16
  "python-dotenv>=1.0.0",
17
  "faiss-cpu>=1.9.0",
18
+ "rank-bm25>=0.2.2", # For BM25 sparse retrieval
19
  ]
src/embeddings/embeddings.py ADDED
File without changes
src/loader/ingest.py CHANGED
@@ -1,90 +1,70 @@
1
  """
2
- UPB Career Data Ingestion Pipeline
3
- Scrapes UPB engineering program pages and saves documents for RAG
4
  """
5
 
6
  from pathlib import Path
7
- import json
8
- from data_loader import load_upb_careers
9
- from config import UPB_ENGINEERING_URLS, TEST_URLS
10
 
11
- # Paths
12
- CURRENT_DIR = Path(__file__).resolve().parent
13
- DATA_DIR = CURRENT_DIR.parent / "data"
14
- RAW_HTML_DIR = DATA_DIR / "raw_html"
15
- PROCESSED_DIR = DATA_DIR / "processed"
16
 
17
- # Create directories
18
- RAW_HTML_DIR.mkdir(parents=True, exist_ok=True)
19
- PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
20
-
21
-
22
- def save_documents_json(documents, output_file: Path):
23
- """Save documents to JSON file"""
24
- doc_dicts = [
25
- {
26
- "page_content": doc.page_content,
27
- "metadata": doc.metadata
28
- }
29
- for doc in documents
30
- ]
31
-
32
- with open(output_file, "w", encoding="utf-8") as f:
33
- json.dump(doc_dicts, f, ensure_ascii=False, indent=2)
34
-
35
- print(f"💾 Saved {len(documents)} documents to {output_file}")
36
-
37
-
38
- def ingest_upb_data(test_mode: bool = False):
39
  """
40
- Main ingestion pipeline
41
 
42
  Args:
43
- test_mode: If True, only scrape TEST_URLS. Otherwise scrape all programs.
44
- """
45
- urls = TEST_URLS if test_mode else UPB_ENGINEERING_URLS
46
-
47
- print("=" * 70)
48
- print("UPB CAREER DATA INGESTION")
49
- print("=" * 70)
50
- print(f"Mode: {'TEST' if test_mode else 'FULL'}")
51
- print(f"URLs to scrape: {len(urls)}\n")
52
 
53
- # Load documents
54
- print("🚀 Starting data collection...\n")
55
- documents = load_upb_careers(urls, save_html=True)
 
 
 
56
 
57
- # Save processed documents
58
- output_file = PROCESSED_DIR / ("upb_careers_test.json" if test_mode else "upb_careers_all.json")
59
- save_documents_json(documents, output_file)
 
 
 
 
 
60
 
61
- # Print summary
62
- print("\n" + "=" * 70)
63
- print("INGESTION SUMMARY")
64
- print("=" * 70)
65
- print(f"✅ Documents loaded: {len(documents)}")
66
- print(f"📊 Total characters: {sum(doc.metadata['char_count'] for doc in documents):,}")
67
- print(f"📁 Raw HTML saved to: {RAW_HTML_DIR}")
68
- print(f"📁 Processed data saved to: {output_file}")
69
 
70
- # Show document titles
71
- print("\n📚 Loaded programs:")
72
- for i, doc in enumerate(documents, 1):
73
- print(f" {i}. {doc.metadata['title']} ({doc.metadata['char_count']:,} chars)")
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  return documents
76
 
77
 
78
  if __name__ == "__main__":
79
- import sys
 
80
 
81
- # Check if user wants full ingestion
82
- test_mode = True
83
- if len(sys.argv) > 1 and sys.argv[1] == "--full":
84
- test_mode = False
85
- print("⚠️ Running FULL ingestion (all engineering programs)")
86
- print("This will take several minutes...\n")
87
 
88
- documents = ingest_upb_data(test_mode=test_mode)
89
-
90
- print("\n✨ Ingestion complete! Documents are ready for RAG processing.")
 
 
 
 
 
 
 
1
  """
2
+ Document Loader Module
3
+ Loads markdown files from the data/ directory with metadata enrichment.
4
  """
5
 
6
  from pathlib import Path
7
+ from langchain_community.document_loaders import DirectoryLoader, TextLoader
 
 
8
 
 
 
 
 
 
9
 
10
def load_upb_documents(show_progress=True):
    """
    Load all markdown files from data/ directory and subdirectories.

    Each document's metadata is enriched with a ``category`` key derived
    from the first path component under ``data/`` (e.g. files under
    ``data/engineerings/`` get category ``engineering``). Files that do not
    match a known subdirectory — including files sitting directly in
    ``data/`` — fall back to ``general``.

    Args:
        show_progress: Whether to show progress bar (default: True)

    Returns:
        list: List of LangChain Document objects with content and metadata
    """
    # data/ lives two levels up from this file: src/loader/ -> project root -> data/
    current_dir = Path(__file__).resolve().parent
    data_dir = current_dir.parent.parent / "data"

    # Load all .md files recursively
    loader = DirectoryLoader(
        str(data_dir),
        glob="**/*.md",
        loader_cls=TextLoader,
        show_progress=show_progress,
        use_multithreading=True
    )
    documents = loader.load()

    # Map top-level subdirectory -> category label. A single dict lookup
    # replaces the previous if/elif chain and keeps the mapping easy to extend.
    category_by_dir = {
        'engineerings': 'engineering',
        'contact': 'contact',
        'enroll': 'enrollment',
        'scholarships': 'scholarships',
    }

    # Tag each document with its source category based on the subdirectory.
    for doc in documents:
        relative_path = Path(doc.metadata['source']).relative_to(data_dir)
        top_dir = relative_path.parts[0]
        doc.metadata['category'] = category_by_dir.get(top_dir, 'general')

    return documents
53
 
54
 
55
if __name__ == "__main__":
    print("🚀 Loading markdown files from data/ directory...\n")
    documents = load_upb_documents()

    # Corpus-level stats: document count and total character volume.
    total_chars = sum(len(doc.page_content) for doc in documents)
    print(f"\n✅ Loaded {len(documents)} documents")
    print(f"📊 Total characters: {total_chars:,}")

    # Tally how many documents landed in each metadata category.
    counts_by_category = {}
    for doc in documents:
        label = doc.metadata.get('category', 'unknown')
        counts_by_category[label] = counts_by_category.get(label, 0) + 1

    print("\n📚 Documents by category:")
    for label in sorted(counts_by_category):
        print(f" - {label}: {counts_by_category[label]} documents")
src/pipeline.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main Data Pipeline
3
+ Orchestrates the complete flow: load → chunk → ready for retrieval.
4
+ """
5
+
6
+ from pathlib import Path
7
+ import sys
8
+
9
+ # Add src to path
10
+ sys.path.insert(0, str(Path(__file__).parent))
11
+
12
+ from loader.ingest import load_upb_documents
13
+ from processing.chunking import chunk_documents
14
+
15
+
16
def prepare_documents_for_rag(chunk_size=1000, chunk_overlap=200, show_progress=True):
    """
    Complete data preparation pipeline: load raw documents, then split them.

    Args:
        chunk_size: Maximum characters per chunk
        chunk_overlap: Overlap between chunks in characters
        show_progress: Show loading progress bar

    Returns:
        list: Chunked documents ready for embedding and retrieval
    """
    # Stage 1: load markdown documents (with category metadata attached).
    loaded = load_upb_documents(show_progress=show_progress)
    # Stage 2: split into retrieval-sized chunks and hand back to the caller.
    return chunk_documents(loaded, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
35
+
36
+
37
if __name__ == "__main__":
    print("=" * 70)
    print("UPB RAG DATA PIPELINE")
    print("=" * 70)
    print("\n📋 Pipeline: Load → Chunk → Ready for Retrieval\n")

    chunks = prepare_documents_for_rag()

    # Guard against an empty data/ directory: len(chunks) == 0 previously
    # crashed the average-size computation with ZeroDivisionError.
    avg_size = (sum(len(c.page_content) for c in chunks) // len(chunks)) if chunks else 0

    print(f"\n✅ Pipeline complete!")
    print(f"📊 Generated {len(chunks)} chunks")
    print(f"📊 Average size: {avg_size} chars")

    # Per-category chunk counts — a quick sanity check of the corpus mix.
    categories = {}
    for chunk in chunks:
        cat = chunk.metadata.get('category', 'unknown')
        categories[cat] = categories.get(cat, 0) + 1

    print("\n📦 Distribution:")
    # Sorted by count (descending); loop body is unreachable when chunks is
    # empty, so the percentage division is safe here.
    for cat, count in sorted(categories.items(), key=lambda x: -x[1]):
        percentage = (count / len(chunks)) * 100
        print(f" - {cat}: {count} chunks ({percentage:.1f}%)")

    print("\n✨ Ready for embedding and retrieval!")
src/processing/chunking.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Document Chunking Module
3
+ Splits documents into smaller chunks optimized for embedding and retrieval.
4
+ """
5
+
6
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
7
+
8
+
9
def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Split documents into smaller chunks for embedding.

    Args:
        documents: List of LangChain Document objects
        chunk_size: Maximum size of each chunk in characters (default: 1000)
        chunk_overlap: Number of characters to overlap between chunks (default: 200)

    Returns:
        list: List of chunked Document objects with preserved metadata
    """
    # Split preferentially on paragraph breaks, then lines, then words,
    # falling back to single characters so no chunk ever exceeds the limit.
    separators = ["\n\n", "\n", " ", ""]

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,  # record each chunk's offset in its source document
        separators=separators,
    )

    return splitter.split_documents(documents)
36
+
37
+
38
if __name__ == "__main__":
    from pathlib import Path
    import sys

    # Make sibling packages (loader/) importable when run as a script.
    sys.path.insert(0, str(Path(__file__).parent.parent))

    from loader.ingest import load_upb_documents

    print("🚀 Loading documents...\n")
    documents = load_upb_documents()

    print(f"✅ Loaded {len(documents)} documents")
    print(f"📊 Total characters: {sum(len(doc.page_content) for doc in documents):,}\n")

    print("✂️ Chunking documents...")
    chunks = chunk_documents(documents)

    # Guard against an empty corpus: len(chunks) == 0 previously crashed the
    # average-size computation with ZeroDivisionError.
    avg_chunk_size = (sum(len(c.page_content) for c in chunks) // len(chunks)) if chunks else 0

    print(f"\n✅ Created {len(chunks)} chunks")
    print(f"📊 Average chunk size: {avg_chunk_size:,} characters")

    # Show chunks by category
    chunk_categories = {}
    for chunk in chunks:
        cat = chunk.metadata.get('category', 'unknown')
        chunk_categories[cat] = chunk_categories.get(cat, 0) + 1

    print("\n📦 Chunks by category:")
    for cat, count in sorted(chunk_categories.items()):
        print(f" - {cat}: {count} chunks")

    print("\n✨ Chunks ready for embedding!")
src/{rag → retrieval}/rag_pipeline.py RENAMED
File without changes
src/retrieval/retriever.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Retrieval Module with Multiple Search Strategies
3
+ Implements dense (vector), sparse (BM25), and MMR-based retrieval.
4
+ """
5
+
6
+ from typing import List, Literal
7
+ from langchain_core.documents import Document
8
+ from langchain_community.retrievers import BM25Retriever
9
+
10
+
11
class SimpleEnsembleRetriever:
    """Ensemble retriever fusing results from multiple retrievers.

    Uses weighted Reciprocal Rank Fusion (RRF): each document receives a
    contribution of ``weight / (rank + 1)`` from every retriever that
    returns it, and the merged list is ordered by total score (descending).
    Documents with identical ``page_content`` are merged into one entry.
    The previous implementation accepted ``weights`` but never used them
    and simply concatenated deduplicated results.
    """

    def __init__(self, retrievers: List, weights: List[float]):
        """
        Args:
            retrievers: Retriever objects exposing ``invoke(query)``.
            weights: Relative weight of each retriever, in the same order.
        """
        self.retrievers = retrievers
        self.weights = weights

    def invoke(self, query: str) -> "List[Document]":
        """Return documents from all retrievers, ranked by weighted RRF score."""
        scores = {}  # page_content -> accumulated fusion score
        docs = {}    # page_content -> first Document object seen with that content
        for retriever, weight in zip(self.retrievers, self.weights):
            for rank, doc in enumerate(retriever.invoke(query)):
                key = doc.page_content
                # Reciprocal-rank contribution: earlier hits count for more.
                scores[key] = scores.get(key, 0.0) + weight / (rank + 1)
                docs.setdefault(key, doc)
        # Stable sort: ties keep first-seen order across retrievers.
        ranked = sorted(docs, key=lambda key: scores[key], reverse=True)
        return [docs[key] for key in ranked]
32
+
33
+
34
class UPBRetriever:
    """
    Multi-strategy retriever for UPB career documents.
    Supports: similarity search, MMR, BM25, and hybrid retrieval.
    """

    def __init__(self, chunks: List[Document], vectorstore=None):
        """
        Initialize retriever with document chunks.

        Args:
            chunks: List of chunked Document objects
            vectorstore: Optional FAISS/ChromaDB vectorstore for dense retrieval
        """
        self.chunks = chunks
        self.vectorstore = vectorstore
        # BM25 index is built lazily on first use and then reused.
        self._bm25_retriever = None

    def get_bm25_retriever(self, k: int = 4) -> BM25Retriever:
        """
        Get or create BM25 retriever for sparse keyword-based search.

        Args:
            k: Number of documents to retrieve

        Returns:
            BM25Retriever instance
        """
        # Build the index only once; later calls just adjust k.
        if self._bm25_retriever is None:
            self._bm25_retriever = BM25Retriever.from_documents(self.chunks)
        self._bm25_retriever.k = k
        return self._bm25_retriever

    def get_dense_retriever(self, k: int = 4, search_type: Literal["similarity", "mmr"] = "similarity"):
        """
        Get dense retriever from vectorstore.

        Args:
            k: Number of documents to retrieve
            search_type: "similarity" for standard search, "mmr" for diverse results

        Returns:
            Vectorstore retriever

        Raises:
            ValueError: If no vectorstore was supplied at construction time.
        """
        if self.vectorstore is None:
            raise ValueError("Vectorstore not initialized. Please create embeddings first.")

        if search_type != "mmr":
            # Plain similarity search over the vector index.
            return self.vectorstore.as_retriever(search_kwargs={"k": k})

        # MMR trades a little relevance for diversity: over-fetch candidates,
        # then keep k results that are relevant yet mutually dissimilar.
        mmr_kwargs = {
            "k": k,
            "fetch_k": k * 5,    # candidate pool to pick diverse results from
            "lambda_mult": 0.7,  # 1.0 = pure relevance, 0.0 = pure diversity
        }
        return self.vectorstore.as_retriever(search_type="mmr", search_kwargs=mmr_kwargs)

    def get_hybrid_retriever(self, k: int = 4, weights: List[float] = None):
        """
        Get hybrid retriever combining BM25 (sparse) and vector (dense) search.

        Args:
            k: Number of documents to retrieve
            weights: [bm25_weight, vector_weight]. Default: [0.5, 0.5]

        Returns:
            SimpleEnsembleRetriever combining both approaches

        Raises:
            ValueError: If no vectorstore was supplied at construction time.
        """
        if self.vectorstore is None:
            raise ValueError("Vectorstore not initialized. Please create embeddings first.")

        return SimpleEnsembleRetriever(
            retrievers=[self.get_bm25_retriever(k=k), self.get_dense_retriever(k=k)],
            weights=weights or [0.5, 0.5],
        )

    def retrieve(
        self,
        query: str,
        method: Literal["bm25", "similarity", "mmr", "hybrid"] = "hybrid",
        k: int = 4,
        **kwargs
    ) -> List[Document]:
        """
        Retrieve relevant documents using specified method.

        Args:
            query: Search query
            method: Retrieval strategy
                - "bm25": Sparse keyword-based (no embeddings needed)
                - "similarity": Dense vector similarity search
                - "mmr": Maximal Marginal Relevance (diverse results)
                - "hybrid": Combination of BM25 + vector search
            k: Number of documents to retrieve
            **kwargs: Additional arguments for specific retrievers

        Returns:
            List of relevant Document objects

        Raises:
            ValueError: If *method* is not one of the supported strategies.
        """
        # Resolve the strategy name to a concrete retriever, then run it.
        if method == "hybrid":
            chosen = self.get_hybrid_retriever(k=k, weights=kwargs.get("weights", [0.5, 0.5]))
        elif method == "bm25":
            chosen = self.get_bm25_retriever(k=k)
        elif method in ("similarity", "mmr"):
            chosen = self.get_dense_retriever(k=k, search_type=method)
        else:
            raise ValueError(f"Unknown retrieval method: {method}")

        return chosen.invoke(query)
156
+
157
+
158
if __name__ == "__main__":
    from pathlib import Path
    import sys

    # Make sibling packages (loader/, processing/) importable as a script.
    sys.path.insert(0, str(Path(__file__).parent.parent))

    from loader.ingest import load_upb_documents
    from processing.chunking import chunk_documents

    print("🚀 Loading and chunking documents...\n")
    chunks = chunk_documents(load_upb_documents())

    print(f"✅ Loaded {len(chunks)} chunks\n")

    # Initialize retriever (without vectorstore for BM25 demo)
    retriever = UPBRetriever(chunks)

    # Exercise the keyword-based path, which needs no embeddings.
    banner = "=" * 70
    print(banner)
    print("TESTING BM25 RETRIEVAL (keyword-based)")
    print(banner)
    query = "ingeniería de sistemas inteligencia artificial"
    results = retriever.retrieve(query, method="bm25", k=3)

    print(f"\nQuery: '{query}'")
    print(f"Results: {len(results)} documents\n")

    for i, doc in enumerate(results, 1):
        print(f"Result {i}:")
        print(f" Category: {doc.metadata.get('category', 'N/A')}")
        print(f" Preview: {doc.page_content[:150]}...")
        print()

    print("✨ Retrieval module ready!")
    print("\nNote: For similarity, MMR, and hybrid search, initialize with a vectorstore.")
uv.lock CHANGED
@@ -791,6 +791,18 @@ wheels = [
791
  { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" },
792
  ]
793
 
 
 
 
 
 
 
 
 
 
 
 
 
794
  [[package]]
795
  name = "regex"
796
  version = "2025.10.23"
@@ -967,6 +979,7 @@ dependencies = [
967
  { name = "langchain-text-splitters" },
968
  { name = "playwright" },
969
  { name = "python-dotenv" },
 
970
  ]
971
 
972
  [package.metadata]
@@ -980,6 +993,7 @@ requires-dist = [
980
  { name = "langchain-text-splitters", specifier = ">=0.3.4" },
981
  { name = "playwright", specifier = ">=1.55.0" },
982
  { name = "python-dotenv", specifier = ">=1.0.0" },
 
983
  ]
984
 
985
  [[package]]
 
791
  { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" },
792
  ]
793
 
794
+ [[package]]
795
+ name = "rank-bm25"
796
+ version = "0.2.2"
797
+ source = { registry = "https://pypi.org/simple" }
798
+ dependencies = [
799
+ { name = "numpy" },
800
+ ]
801
+ sdist = { url = "https://files.pythonhosted.org/packages/fc/0a/f9579384aa017d8b4c15613f86954b92a95a93d641cc849182467cf0bb3b/rank_bm25-0.2.2.tar.gz", hash = "sha256:096ccef76f8188563419aaf384a02f0ea459503fdf77901378d4fd9d87e5e51d", size = 8347, upload-time = "2022-02-16T12:10:52.196Z" }
802
+ wheels = [
803
+ { url = "https://files.pythonhosted.org/packages/2a/21/f691fb2613100a62b3fa91e9988c991e9ca5b89ea31c0d3152a3210344f9/rank_bm25-0.2.2-py3-none-any.whl", hash = "sha256:7bd4a95571adadfc271746fa146a4bcfd89c0cf731e49c3d1ad863290adbe8ae", size = 8584, upload-time = "2022-02-16T12:10:50.626Z" },
804
+ ]
805
+
806
  [[package]]
807
  name = "regex"
808
  version = "2025.10.23"
 
979
  { name = "langchain-text-splitters" },
980
  { name = "playwright" },
981
  { name = "python-dotenv" },
982
+ { name = "rank-bm25" },
983
  ]
984
 
985
  [package.metadata]
 
993
  { name = "langchain-text-splitters", specifier = ">=0.3.4" },
994
  { name = "playwright", specifier = ">=1.55.0" },
995
  { name = "python-dotenv", specifier = ">=1.0.0" },
996
+ { name = "rank-bm25", specifier = ">=0.2.2" },
997
  ]
998
 
999
  [[package]]