# RAG data-ingestion pipeline: article scraping (newspaper3k) + sentence-aware text chunking.
# Standard library
import re
import time
from typing import Dict, List
from urllib.parse import urljoin, urlparse

# Third-party
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize
from newspaper import Article

# Ensure the NLTK 'punkt' sentence-tokenizer model is present before
# sent_tokenize is first called; download it once if missing.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
class WebScraper:
    """Fetch and parse web articles via newspaper3k, with polite rate limiting."""

    def __init__(self, delay: float = 1.0):
        # Seconds to pause before each request so repeated scrapes do not
        # hammer the target server.
        self.delay = delay

    def scrape_article(self, url: str) -> Dict[str, str]:
        """Download and parse a single article.

        Args:
            url: Absolute URL of the article to fetch.

        Returns:
            Dict with 'url', 'title', 'content', 'word_count', 'char_count'.
            On failure the text fields are empty, the counts are 0, and an
            extra 'error' key holds the exception message.
        """
        try:
            # Honor the configured delay (it was stored but never used before).
            if self.delay > 0:
                time.sleep(self.delay)
            article = Article(url)
            article.download()
            article.parse()
            # Guard against a None text so split()/len() cannot blow up.
            text = article.text or ''
            return {
                'url': url,
                'title': article.title or 'Untitled',
                'content': text,
                'word_count': len(text.split()),
                'char_count': len(text),
            }
        except Exception as e:
            # Best-effort contract: report the failure in the result rather
            # than raising, so batch scraping can continue past bad URLs.
            return {
                'url': url,
                'title': '',
                'content': '',
                'error': str(e),
                'word_count': 0,
                'char_count': 0,
            }
class TextChunker:
    """Split text into overlapping, sentence-aligned chunks."""

    def __init__(self, chunk_size: int = 100, overlap: int = 20):
        """
        Initialize text chunker.

        Args:
            chunk_size: Maximum words per chunk.
            overlap: Desired overlap between consecutive chunks, in words;
                internally converted to roughly ``overlap // 10`` sentences.
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text: str, metadata: Dict = None) -> List[Dict]:
        """
        Split text into overlapping chunks on sentence boundaries.

        Args:
            text: Text to chunk.
            metadata: Optional extra keys merged into every chunk dict.

        Returns:
            List of chunk dictionaries (see _create_chunk_dict).
        """
        if not text.strip():
            return []

        # Sentence tokenization keeps chunk boundaries at natural breaks.
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_length = 0

        # Number of trailing sentences carried over into the next chunk.
        overlap_sents = self.overlap // 10

        for sentence in sentences:
            sentence_length = len(sentence.split())
            # Close the current chunk once adding this sentence would exceed
            # the word budget (never emit an empty chunk).
            if current_length + sentence_length > self.chunk_size and current_chunk:
                joined = ' '.join(current_chunk)
                chunks.append(self._create_chunk_dict(joined, metadata, len(chunks)))
                # Seed the next chunk with the trailing overlap sentences.
                # NOTE: when overlap_sents == 0 the slice [-0:] copies the
                # whole chunk — preserved from the original implementation.
                carried = current_chunk[-overlap_sents:] if len(current_chunk) >= overlap_sents else current_chunk
                current_chunk = carried + [sentence]
                current_length = sum(len(s.split()) for s in current_chunk)
            else:
                current_chunk.append(sentence)
                current_length += sentence_length

        # Flush whatever remains as the final chunk.
        if current_chunk:
            chunks.append(self._create_chunk_dict(' '.join(current_chunk), metadata, len(chunks)))
        return chunks

    def _create_chunk_dict(self, text: str, metadata: Dict, chunk_id: int) -> Dict:
        """Build one chunk record; metadata keys (if any) are merged in last."""
        chunk_dict = {
            'chunk_id': chunk_id,
            'text': text,
            'word_count': len(text.split()),
            'char_count': len(text),
        }
        if metadata:
            chunk_dict.update(metadata)
        return chunk_dict
# Example usage / smoke test: scrape one article and chunk it.
if __name__ == "__main__":
    scraper = WebScraper()
    chunker = TextChunker()

    # Test URL (replace with your target URL)
    test_url = "https://medium.com/@aminajavaid30/building-a-rag-system-the-data-ingestion-pipeline-d04235fd17ea"

    # Scrape content
    article_data = scraper.scrape_article(test_url)
    if article_data.get('error'):
        # Surface scrape failures instead of silently printing empty stats.
        print(f"Scrape failed: {article_data['error']}")
    print(f"Title: {article_data['title']}")
    print(f"Content length: {article_data['word_count']} words")

    # Create chunks
    if article_data['content']:
        chunks = chunker.chunk_text(
            article_data['content'],
            metadata={
                'url': article_data['url'],
                'title': article_data['title'],
            }
        )
        print(f"Created {len(chunks)} chunks")

        # Show first chunk
        if chunks:
            print(f"First chunk: {chunks[0]['text'][:200]}...")