# Ali_Chatbot / web_scraper.py
# Author: Ali Abdullah
# Last change: "Update web_scraper.py" (commit aec7049, verified)
# NOTE(review): the lines above were GitHub page residue pasted as bare text;
# converted to comments so the module parses.
import re
import time
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import nltk
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
# Download required NLTK data
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')
from newspaper import Article
class WebScraper:
    """Fetch and parse web articles using newspaper's ``Article`` API."""

    def __init__(self, delay: float = 1.0):
        """
        Args:
            delay: Seconds to sleep before each request (simple politeness
                rate limit; set to 0 to disable).
        """
        self.delay = delay

    def scrape_article(self, url: str) -> Dict[str, str]:
        """
        Download and parse a single article.

        Args:
            url: Address of the page to scrape.

        Returns:
            On success: dict with 'url', 'title', 'content', 'word_count',
            'char_count'. On failure: same keys with empty title/content,
            zero counts, plus an 'error' message — this method never raises.
        """
        # Fix: `delay` was stored but never used, so scraping loops hit the
        # target site with no pause at all.
        if self.delay > 0:
            time.sleep(self.delay)
        try:
            article = Article(url)
            article.download()
            article.parse()
            # newspaper can yield None for .text; normalize so the count
            # calls below cannot crash and be misreported as a scrape error.
            text = article.text or ''
            return {
                'url': url,
                'title': article.title or 'Untitled',
                'content': text,
                'word_count': len(text.split()),
                'char_count': len(text)
            }
        except Exception as e:
            # Broad catch is deliberate: newspaper raises assorted network
            # and parse exceptions; the error is surfaced to the caller in
            # the result dict instead of propagating.
            return {
                'url': url,
                'title': '',
                'content': '',
                'error': str(e),
                'word_count': 0,
                'char_count': 0
            }
class TextChunker:
    """Split text into overlapping chunks aligned on sentence boundaries."""

    def __init__(self, chunk_size: int = 100, overlap: int = 20):
        """
        Initialize text chunker.

        Args:
            chunk_size: Maximum words per chunk.
            overlap: Approximate word overlap between consecutive chunks;
                converted to a sentence count as ``overlap // 10``.
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text: str, metadata: Optional[Dict] = None) -> List[Dict]:
        """
        Split ``text`` into overlapping chunks.

        Args:
            text: Text to chunk.
            metadata: Additional key/value pairs merged into every chunk dict.

        Returns:
            List of chunk dictionaries (see ``_create_chunk_dict``).
        """
        if not text.strip():
            return []
        # Sentence tokenization gives cleaner chunk boundaries than a raw
        # word-window split.
        sentences = sent_tokenize(text)
        chunks: List[Dict] = []
        current_sentences: List[str] = []
        current_words = 0
        # Number of trailing sentences carried into the next chunk.
        overlap_count = self.overlap // 10
        for sentence in sentences:
            sentence_words = len(sentence.split())
            if current_words + sentence_words > self.chunk_size and current_sentences:
                # Current chunk is full: emit it. (A leftover debug print here
                # previously dumped the *entire* input text for every chunk.)
                chunks.append(self._create_chunk_dict(
                    ' '.join(current_sentences), metadata, len(chunks)))
                # Fix: with overlap_count == 0 the old slice [-0:] copied the
                # WHOLE previous chunk forward instead of none of it.
                carried = current_sentences[-overlap_count:] if overlap_count > 0 else []
                current_sentences = carried + [sentence]
                current_words = sum(len(s.split()) for s in current_sentences)
            else:
                current_sentences.append(sentence)
                current_words += sentence_words
        # Flush the final partial chunk.
        if current_sentences:
            chunks.append(self._create_chunk_dict(
                ' '.join(current_sentences), metadata, len(chunks)))
        return chunks

    def _create_chunk_dict(self, text: str, metadata: Optional[Dict], chunk_id: int) -> Dict:
        """Create a chunk dictionary with word/char counts plus metadata.

        Note: metadata keys are merged last, so a metadata key named e.g.
        'text' would override the computed field — callers control the keys.
        """
        chunk_dict = {
            'chunk_id': chunk_id,
            'text': text,
            'word_count': len(text.split()),
            'char_count': len(text)
        }
        if metadata:
            chunk_dict.update(metadata)
        return chunk_dict
# Example usage
if __name__ == "__main__":
    # Demo: scrape a single article and break it into chunks.
    test_url = "https://medium.com/@aminajavaid30/building-a-rag-system-the-data-ingestion-pipeline-d04235fd17ea"

    scraper = WebScraper()
    chunker = TextChunker()

    article_data = scraper.scrape_article(test_url)
    print(f"Title: {article_data['title']}")
    print(f"Content length: {article_data['word_count']} words")

    # The error path still returns a 'content' key (empty string), so this
    # guard also skips chunking when the scrape failed.
    if article_data['content']:
        chunk_metadata = {
            'url': article_data['url'],
            'title': article_data['title'],
        }
        chunks = chunker.chunk_text(article_data['content'], metadata=chunk_metadata)
        print(f"Created {len(chunks)} chunks")
        if chunks:
            print(f"First chunk: {chunks[0]['text'][:200]}...")