"""Semantically chunk articles and write the chunks to a JSONL file.

Articles come either from a single ``articles.json`` file (``main``) or from
every ``*.json`` file in a folder (``chunk_from_json_files``); each article is
split with LlamaIndex's semantic splitter and written as one JSON record per
chunk.
"""

import json
import os

from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.schema import Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from config import MODEL_NAME

# Number of sentences grouped together when evaluating semantic similarity.
BUFFER_SIZE = 3
# Distance percentile above which a sentence boundary becomes a chunk break.
BREAKPOINT_PERCENTILE_THRESHOLD = 87
# How many articles main() loads from articles.json.
NUMBER_OF_ARTICLES = 86
INPUT_FOLDER = "extracted_content"
OUTPUT_FILE = "chunks.jsonl"


def load_articles(json_path="articles.json", n=None):
    """Load articles from a JSON file.

    Args:
        json_path: Path to a JSON file containing a list of article dicts.
        n: If truthy, return only the first ``n`` articles.

    Returns:
        A list of article dicts.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        articles = json.load(f)
    return articles[:n] if n else articles


def chunk_text_semantic(text, embed_model):
    """Split *text* into semantically coherent chunks.

    Uses a sentence buffer (BUFFER_SIZE) so adjacent chunks share context,
    and breaks where embedding distance exceeds the configured percentile.

    Args:
        text: Raw article text.
        embed_model: Embedding model used to score sentence similarity.

    Returns:
        A list of chunk strings.
    """
    splitter = SemanticSplitterNodeParser(
        embed_model=embed_model,
        buffer_size=BUFFER_SIZE,
        breakpoint_percentile_threshold=BREAKPOINT_PERCENTILE_THRESHOLD,
    )
    doc = Document(text=text)
    nodes = splitter.get_nodes_from_documents([doc])
    return [node.text for node in nodes]


def _write_article_chunks(f, article, embed_model):
    """Chunk one article and append its JSONL records to the open file *f*.

    Args:
        f: Writable text file handle (JSONL output).
        article: Dict with at least "url", "title", "text"; "date" optional.
        embed_model: Embedding model passed through to the splitter.

    Returns:
        The number of chunks written for this article.
    """
    chunks = chunk_text_semantic(article["text"], embed_model)
    for i, chunk in enumerate(chunks, 1):
        record = {
            "url": article["url"],
            "title": article["title"],
            "date": article.get("date"),
            "chunk_id": i,
            "text": chunk,
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
    return len(chunks)


def make_jsonl(articles, out_path=OUTPUT_FILE):
    """Create a JSONL file of semantic chunks from *articles*.

    Args:
        articles: List of article dicts ("url", "title", "text", opt. "date").
        out_path: Destination JSONL path (overwritten if it exists).
    """
    print("Loading embedding model for semantic chunking...")
    embed_model = HuggingFaceEmbedding(model_name=MODEL_NAME)
    with open(out_path, "w", encoding="utf-8") as f:
        for idx, article in enumerate(articles, 1):
            print(f"Chunking ({idx}/{len(articles)}): {article['title']}")
            _write_article_chunks(f, article, embed_model)


def chunk_from_json_files(input_folder=INPUT_FOLDER, output_file=OUTPUT_FILE):
    """Load articles from every JSON file in *input_folder* and chunk to JSONL.

    Each JSON file is expected to contain a list of article dicts; all files'
    articles are concatenated before chunking.

    Args:
        input_folder: Folder scanned (non-recursively) for ``*.json`` files.
        output_file: Destination JSONL path (overwritten if it exists).
    """
    if not os.path.exists(input_folder):
        print(f"Input folder '{input_folder}' not found")
        return

    # Gather articles from every JSON file in the folder.
    all_articles = []
    json_files = [f for f in os.listdir(input_folder) if f.endswith('.json')]
    if not json_files:
        print(f"No JSON files found in {input_folder}")
        return

    for json_file in json_files:
        json_path = os.path.join(input_folder, json_file)
        with open(json_path, "r", encoding="utf-8") as f:
            articles = json.load(f)
            all_articles.extend(articles)
            print(f"Loaded {len(articles)} articles from {json_file}")

    if not all_articles:
        print("No articles found to chunk")
        return

    print(f"\nTotal articles to chunk: {len(all_articles)}")
    print("Loading embedding model for semantic chunking...")
    embed_model = HuggingFaceEmbedding(model_name=MODEL_NAME)

    chunk_count = 0
    with open(output_file, "w", encoding="utf-8") as f:
        for idx, article in enumerate(all_articles, 1):
            print(f"Chunking ({idx}/{len(all_articles)}): {article['title']}")
            chunk_count += _write_article_chunks(f, article, embed_model)

    print(f"\nāœ“ Created {chunk_count} chunks from {len(all_articles)} articles")
    print(f"šŸ’¾ Saved to {output_file}")


def main():
    """Chunk the first NUMBER_OF_ARTICLES articles from articles.json."""
    articles = load_articles(n=NUMBER_OF_ARTICLES)
    if not articles:
        print("No articles found in articles.json")
        return
    make_jsonl(articles)
    print(f"Chunks from {len(articles)} articles written to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()