Spaces:
Sleeping
Sleeping
File size: 4,476 Bytes
063051a aec7049 063051a aec7049 063051a 56b13cf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import re
import time
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import nltk
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
# Download required NLTK data
# Ensure the NLTK 'punkt' sentence-tokenizer model is available locally,
# downloading it once on first run.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:  # model not yet downloaded on this machine
    nltk.download('punkt')
from newspaper import Article
class WebScraper:
    """Fetch a web article and extract its text via newspaper3k's ``Article``."""

    def __init__(self, delay: float = 1.0):
        # Politeness delay between requests, in seconds. Stored for callers;
        # NOTE(review): nothing in this class sleeps on it — confirm intended use.
        self.delay = delay

    def scrape_article(self, url: str) -> Dict[str, str]:
        """Download and parse *url*, returning its content plus size stats.

        Args:
            url: Address of the article to fetch.

        Returns:
            On success: dict with 'url', 'title' (falls back to 'Untitled'),
            'content', 'word_count', 'char_count'.
            On any failure: same shape with empty title/content, zero counts,
            and an extra 'error' key holding the exception text — never raises.
        """
        try:
            piece = Article(url)
            piece.download()
            piece.parse()
            body = piece.text
            return {
                'url': url,
                'title': piece.title or 'Untitled',
                'content': body,
                'word_count': len(body.split()),
                'char_count': len(body),
            }
        except Exception as exc:
            # Best-effort contract: report the failure in-band instead of raising.
            return {
                'url': url,
                'title': '',
                'content': '',
                'error': str(exc),
                'word_count': 0,
                'char_count': 0,
            }
class TextChunker:
    """Split text into overlapping, sentence-aligned chunks for indexing."""

    def __init__(self, chunk_size: int = 100, overlap: int = 20):
        """
        Initialize text chunker.

        Args:
            chunk_size: Maximum words (whitespace tokens) per chunk.
            overlap: Approximate word overlap between consecutive chunks;
                ``overlap // 10`` trailing sentences are carried over.
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text: str, metadata: Optional[Dict] = None) -> List[Dict]:
        """
        Split text into overlapping chunks with sentence boundaries.

        Args:
            text: Text to chunk.
            metadata: Extra key/values merged into every chunk dict.

        Returns:
            List of chunk dictionaries; empty list for blank input.
        """
        if not text.strip():
            return []
        # Sentence tokenization gives cleaner boundaries than cutting mid-sentence.
        sentences = sent_tokenize(text)
        chunks: List[Dict] = []
        current_chunk: List[str] = []
        current_length = 0
        for sentence in sentences:
            sentence_length = len(sentence.split())
            # Close the current chunk when this sentence would overflow it.
            if current_length + sentence_length > self.chunk_size and current_chunk:
                # BUGFIX: removed leftover debug print that dumped the *full*
                # input text prefix (not the chunk) to stdout on every chunk.
                chunks.append(self._create_chunk_dict(' '.join(current_chunk), metadata, len(chunks)))
                # Carry the last few sentences into the next chunk for context.
                # BUGFIX: when overlap < 10, keep == 0 and the old slice [-0:]
                # carried the ENTIRE previous chunk forward; now carries none.
                keep = self.overlap // 10
                overlap_sentences = current_chunk[-keep:] if keep > 0 else []
                current_chunk = overlap_sentences + [sentence]
                current_length = sum(len(s.split()) for s in current_chunk)
            else:
                current_chunk.append(sentence)
                current_length += sentence_length
        # Flush the final partial chunk.
        if current_chunk:
            chunks.append(self._create_chunk_dict(' '.join(current_chunk), metadata, len(chunks)))
        return chunks

    def _create_chunk_dict(self, text: str, metadata: Optional[Dict], chunk_id: int) -> Dict:
        """Build one chunk record; metadata keys override computed ones on collision."""
        chunk_dict = {
            'chunk_id': chunk_id,
            'text': text,
            'word_count': len(text.split()),
            'char_count': len(text)
        }
        if metadata:
            chunk_dict.update(metadata)
        return chunk_dict
# Example usage / smoke test
if __name__ == "__main__":
    scraper = WebScraper()
    chunker = TextChunker()

    # Article to fetch for the demo (swap in your own target URL).
    test_url = "https://medium.com/@aminajavaid30/building-a-rag-system-the-data-ingestion-pipeline-d04235fd17ea"

    # Scrape, report, then chunk whatever content came back.
    article_data = scraper.scrape_article(test_url)
    print(f"Title: {article_data['title']}")
    print(f"Content length: {article_data['word_count']} words")

    if article_data['content']:
        chunks = chunker.chunk_text(
            article_data['content'],
            metadata={'url': article_data['url'], 'title': article_data['title']},
        )
        print(f"Created {len(chunks)} chunks")
        if chunks:
            print(f"First chunk: {chunks[0]['text'][:200]}...")