File size: 4,476 Bytes
063051a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aec7049
063051a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aec7049
063051a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56b13cf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import requests
from bs4 import BeautifulSoup
import re
from typing import List, Dict
from urllib.parse import urljoin, urlparse
import time
import nltk
from nltk.tokenize import sent_tokenize

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

from newspaper import Article

class WebScraper:
    def __init__(self, delay: float = 1.0):
        self.delay = delay

    def scrape_article(self, url: str) -> Dict[str, str]:
        try:
            article = Article(url)
            article.download()
            article.parse()

            return {
                'url': url,
                'title': article.title or 'Untitled',
                'content': article.text,
                'word_count': len(article.text.split()),
                'char_count': len(article.text)
            }

        except Exception as e:
            return {
                'url': url,
                'title': '',
                'content': '',
                'error': str(e),
                'word_count': 0,
                'char_count': 0
            }

class TextChunker:
    def __init__(self, chunk_size: int = 100, overlap: int = 20):
        """
        Initialize text chunker
        Args:
            chunk_size: Maximum tokens per chunk
            overlap: Overlap between chunks
        """
        self.chunk_size = chunk_size
        self.overlap = overlap
    
    def chunk_text(self, text: str, metadata: Dict = None) -> List[Dict]:
        """
        Split text into overlapping chunks
        Args:
            text: Text to chunk
            metadata: Additional metadata to include
        Returns:
            List of chunk dictionaries
        """
        if not text.strip():
            return []
        
        # Use sentence tokenization for better chunk boundaries
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_length = 0
        
        for sentence in sentences:
            sentence_length = len(sentence.split())
            
            # If adding this sentence would exceed chunk size, create a new chunk
            if current_length + sentence_length > self.chunk_size and current_chunk:
                chunk_text = ' '.join(current_chunk)
                print(f"📄 Chunk {len(chunks)}:\n{text[:150]}...\n")
                chunks.append(self._create_chunk_dict(chunk_text, metadata, len(chunks)))
                
                # Start new chunk with overlap
                overlap_sentences = current_chunk[-self.overlap//10:] if len(current_chunk) >= self.overlap//10 else current_chunk
                current_chunk = overlap_sentences + [sentence]
                current_length = sum(len(s.split()) for s in current_chunk)
            else:
                current_chunk.append(sentence)
                current_length += sentence_length
        
        # Add the last chunk
        if current_chunk:
            chunk_text = ' '.join(current_chunk)
            chunks.append(self._create_chunk_dict(chunk_text, metadata, len(chunks)))
        
        return chunks
    
    def _create_chunk_dict(self, text: str, metadata: Dict, chunk_id: int) -> Dict:
        """Create a chunk dictionary with metadata"""
        chunk_dict = {
            'chunk_id': chunk_id,
            'text': text,
            'word_count': len(text.split()),
            'char_count': len(text)
        }
        
        if metadata:
            chunk_dict.update(metadata)
        
        return chunk_dict

# Example usage
if __name__ == "__main__":
    # Test the scraper
    scraper = WebScraper()
    chunker = TextChunker()
    
    # Test URL (replace with your target URL)
    test_url = "https://medium.com/@aminajavaid30/building-a-rag-system-the-data-ingestion-pipeline-d04235fd17ea"
    
    # Scrape content
    article_data = scraper.scrape_article(test_url)
    print(f"Title: {article_data['title']}")
    print(f"Content length: {article_data['word_count']} words")
    
    # Create chunks
    if article_data['content']:
        chunks = chunker.chunk_text(
            article_data['content'], 
            metadata={
                'url': article_data['url'],
                'title': article_data['title']
            }
        )
        print(f"Created {len(chunks)} chunks")
        
        # Show first chunk
        if chunks:
            print(f"First chunk: {chunks[0]['text'][:200]}...")