# 80000_Hours_AI_Assistant / chunk_articles_cli.py
# Author: Ryan
# Commit 99a81ef - add all content to vector db
import json
import os
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.schema import Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from config import MODEL_NAME
# Number of sentences grouped together when evaluating semantic similarity
# (acts as overlap/context between candidate chunks).
BUFFER_SIZE = 3
# Percentile of embedding-distance breakpoints above which a split is made;
# higher values produce fewer, larger chunks.
BREAKPOINT_PERCENTILE_THRESHOLD = 87
# Number of articles read from articles.json by main() (passed as n to
# load_articles).
NUMBER_OF_ARTICLES = 86
# Folder scanned by chunk_from_json_files() for per-source JSON article files.
INPUT_FOLDER = "extracted_content"
# Default JSONL output path for chunk_from_json_files().
OUTPUT_FILE = "chunks.jsonl"
def load_articles(json_path="articles.json", n=None):
    """Load articles from a JSON file.

    Args:
        json_path: Path to a JSON file holding a list of article objects.
        n: If given, return only the first ``n`` articles (``n=0`` yields
           an empty list); if ``None``, return everything.

    Returns:
        The (possibly truncated) list parsed from the file.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        articles = json.load(f)
    # Explicit None check: the previous truthiness test (`if n`) treated
    # n=0 the same as "no limit" and returned every article.
    return articles[:n] if n is not None else articles
def chunk_text_semantic(text, embed_model):
    """Split *text* into semantically coherent chunks.

    Builds a SemanticSplitterNodeParser configured with the module-level
    BUFFER_SIZE (sentence overlap) and BREAKPOINT_PERCENTILE_THRESHOLD,
    runs it over the text wrapped in a single Document, and returns the
    plain text of each resulting node.
    """
    parser = SemanticSplitterNodeParser(
        embed_model=embed_model,
        buffer_size=BUFFER_SIZE,
        breakpoint_percentile_threshold=BREAKPOINT_PERCENTILE_THRESHOLD,
    )
    parsed_nodes = parser.get_nodes_from_documents([Document(text=text)])
    return [parsed.text for parsed in parsed_nodes]
def make_jsonl(articles, out_path="chunks.jsonl"):
    """Write semantic chunks for every article in *articles* to *out_path*.

    Each output line is a JSON object carrying url, title, date (may be
    None), a 1-based chunk_id within its article, and the chunk text.
    """
    print("Loading embedding model for semantic chunking...")
    model = HuggingFaceEmbedding(model_name=MODEL_NAME)
    total = len(articles)
    with open(out_path, "w", encoding="utf-8") as out:
        for position, article in enumerate(articles, 1):
            print(f"Chunking ({position}/{total}): {article['title']}")
            pieces = chunk_text_semantic(article["text"], model)
            for chunk_no, piece in enumerate(pieces, 1):
                row = {
                    "url": article["url"],
                    "title": article["title"],
                    "date": article.get("date"),
                    "chunk_id": chunk_no,
                    "text": piece,
                }
                out.write(json.dumps(row, ensure_ascii=False) + "\n")
def chunk_from_json_files(input_folder=INPUT_FOLDER, output_file=OUTPUT_FILE):
    """Load articles from JSON files in a folder and chunk them to JSONL.

    Args:
        input_folder: Directory containing ``*.json`` files, each holding
            a list of article dicts with at least ``url``, ``title`` and
            ``text`` keys (``date`` is optional).
        output_file: Destination JSONL path; one JSON object per chunk.

    Side effects: prints progress and writes *output_file*. Returns None.
    """
    # isdir (not exists): a plain file with this name would pass the old
    # exists() check and then crash os.listdir below.
    if not os.path.isdir(input_folder):
        print(f"Input folder '{input_folder}' not found")
        return
    # Load all articles from JSON files. Sort the listing: os.listdir
    # order is arbitrary/platform-dependent, and sorting makes the
    # output JSONL deterministic across runs.
    all_articles = []
    json_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.json'))
    if not json_files:
        print(f"No JSON files found in {input_folder}")
        return
    for json_file in json_files:
        json_path = os.path.join(input_folder, json_file)
        with open(json_path, "r", encoding="utf-8") as f:
            articles = json.load(f)
        # NOTE(review): assumes each file holds a list of article dicts —
        # a top-level dict here would extend() with its keys; confirm
        # against the extraction step that produces these files.
        all_articles.extend(articles)
        print(f"Loaded {len(articles)} articles from {json_file}")
    if not all_articles:
        print("No articles found to chunk")
        return
    print(f"\nTotal articles to chunk: {len(all_articles)}")
    print("Loading embedding model for semantic chunking...")
    embed_model = HuggingFaceEmbedding(model_name=MODEL_NAME)
    chunk_count = 0
    with open(output_file, "w", encoding="utf-8") as f:
        for idx, article in enumerate(all_articles, 1):
            print(f"Chunking ({idx}/{len(all_articles)}): {article['title']}")
            chunks = chunk_text_semantic(article["text"], embed_model)
            for i, chunk in enumerate(chunks, 1):
                record = {
                    "url": article["url"],
                    "title": article["title"],
                    "date": article.get("date"),
                    "chunk_id": i,       # 1-based within each article
                    "text": chunk,
                }
                chunk_count += 1
                f.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f"\n✓ Created {chunk_count} chunks from {len(all_articles)} articles")
    print(f"💾 Saved to {output_file}")
def main():
    """CLI entry point: chunk the first NUMBER_OF_ARTICLES from articles.json."""
    loaded = load_articles(n=NUMBER_OF_ARTICLES)
    # Bail out early when articles.json is empty (or truncated to nothing).
    if not loaded:
        print("No articles found in articles.json")
        return
    make_jsonl(loaded)
    print(f"Chunks from {len(loaded)} articles written to chunks.jsonl")
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()