import requests
from bs4 import BeautifulSoup
import re
from typing import List, Dict, Optional
from urllib.parse import urljoin, urlparse
import time
import nltk
from nltk.tokenize import sent_tokenize

# Download required NLTK tokenizer data once at import time.
# NOTE(review): newer NLTK releases look up 'punkt_tab' instead of 'punkt',
# so bootstrap both; nltk.download() of an unknown resource is a no-op.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource)

from newspaper import Article


class WebScraper:
    """Fetch and parse web articles via newspaper's ``Article``."""

    def __init__(self, delay: float = 1.0):
        """
        Args:
            delay: Seconds to sleep before each request (politeness delay).
        """
        self.delay = delay

    def scrape_article(self, url: str) -> Dict[str, str]:
        """Download and parse a single article.

        Returns:
            Dict with 'url', 'title', 'content', 'word_count', 'char_count'.
            On failure the dict carries an additional 'error' key and
            empty/zero content fields — this method never raises.
        """
        try:
            # BUG FIX: ``delay`` was stored by __init__ but never used;
            # apply the advertised rate limit before each request.
            time.sleep(self.delay)
            article = Article(url)
            article.download()
            article.parse()
            return {
                'url': url,
                'title': article.title or 'Untitled',
                'content': article.text,
                'word_count': len(article.text.split()),
                'char_count': len(article.text),
            }
        except Exception as e:
            # Deliberate best-effort contract: report the failure in-band
            # so a batch of URLs keeps going when one article breaks.
            return {
                'url': url,
                'title': '',
                'content': '',
                'error': str(e),
                'word_count': 0,
                'char_count': 0,
            }


class TextChunker:
    def __init__(self, chunk_size: int = 100, overlap: int = 20):
        """
        Initialize text chunker

        Args:
            chunk_size: Maximum tokens (whitespace-split words) per chunk
            overlap: Overlap between chunks, in tokens; approximately
                ``overlap // 10`` trailing sentences are carried into the
                next chunk
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text: str, metadata: Optional[Dict] = None) -> List[Dict]:
        """
        Split text into overlapping chunks along sentence boundaries.

        Args:
            text: Text to chunk
            metadata: Additional metadata merged into every chunk dict

        Returns:
            List of chunk dictionaries (empty list for blank input)
        """
        if not text.strip():
            return []

        # Sentence tokenization gives cleaner chunk boundaries than raw slicing.
        sentences = sent_tokenize(text)

        chunks: List[Dict] = []
        current_chunk: List[str] = []
        current_length = 0
        # Number of trailing sentences carried over into the next chunk.
        overlap_count = self.overlap // 10

        for sentence in sentences:
            sentence_length = len(sentence.split())

            # If adding this sentence would exceed the chunk size, flush.
            if current_length + sentence_length > self.chunk_size and current_chunk:
                joined = ' '.join(current_chunk)
                # BUG FIX: previously printed text[:150] — a preview of the
                # ENTIRE input document — instead of the chunk being emitted.
                print(f"📄 Chunk {len(chunks)}:\n{joined[:150]}...\n")
                chunks.append(self._create_chunk_dict(joined, metadata, len(chunks)))

                # Seed the next chunk with the overlap tail plus this sentence.
                # BUG FIX: when overlap_count == 0 the old slice [-0:] copied
                # the WHOLE previous chunk, making chunks grow without bound.
                if overlap_count <= 0:
                    overlap_sentences = []
                elif len(current_chunk) >= overlap_count:
                    overlap_sentences = current_chunk[-overlap_count:]
                else:
                    overlap_sentences = current_chunk
                current_chunk = overlap_sentences + [sentence]
                current_length = sum(len(s.split()) for s in current_chunk)
            else:
                current_chunk.append(sentence)
                current_length += sentence_length

        # Flush the final partial chunk.
        if current_chunk:
            joined = ' '.join(current_chunk)
            chunks.append(self._create_chunk_dict(joined, metadata, len(chunks)))

        return chunks

    def _create_chunk_dict(self, text: str, metadata: Optional[Dict], chunk_id: int) -> Dict:
        """Create a chunk dictionary with word/char counts plus caller metadata."""
        chunk_dict = {
            'chunk_id': chunk_id,
            'text': text,
            'word_count': len(text.split()),
            'char_count': len(text),
        }
        if metadata:
            chunk_dict.update(metadata)
        return chunk_dict


# Example usage
if __name__ == "__main__":
    # Test the scraper
    scraper = WebScraper()
    chunker = TextChunker()

    # Test URL (replace with your target URL)
    test_url = "https://medium.com/@aminajavaid30/building-a-rag-system-the-data-ingestion-pipeline-d04235fd17ea"

    # Scrape content
    article_data = scraper.scrape_article(test_url)
    print(f"Title: {article_data['title']}")
    print(f"Content length: {article_data['word_count']} words")

    # Create chunks
    if article_data['content']:
        chunks = chunker.chunk_text(
            article_data['content'],
            metadata={
                'url': article_data['url'],
                'title': article_data['title'],
            }
        )

        print(f"Created {len(chunks)} chunks")

        # Show first chunk
        if chunks:
            print(f"First chunk: {chunks[0]['text'][:200]}...")