# RAG data-ingestion pipeline: article scraping (newspaper3k) + sentence-aware text chunking.
# Standard library
import re
import time
from typing import Dict, List
from urllib.parse import urljoin, urlparse

# Third-party
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize
from newspaper import Article

# Ensure the NLTK 'punkt' sentence-tokenizer model is present before
# sent_tokenize is first called; download it once if missing.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
class WebScraper:
    """Fetch and parse web articles via newspaper3k, with polite rate limiting."""

    def __init__(self, delay: float = 1.0):
        # Seconds to pause before each request so repeated scrapes do not
        # hammer the target server.
        self.delay = delay

    def scrape_article(self, url: str) -> Dict[str, str]:
        """Download and parse a single article.

        Args:
            url: Absolute URL of the article to fetch.

        Returns:
            Dict with 'url', 'title', 'content', 'word_count', 'char_count'.
            On failure the text fields are empty, the counts are 0, and an
            extra 'error' key holds the exception message.
        """
        try:
            # Honor the configured delay (it was stored but never used before).
            if self.delay > 0:
                time.sleep(self.delay)
            article = Article(url)
            article.download()
            article.parse()
            # Guard against a None text so split()/len() cannot blow up.
            text = article.text or ''
            return {
                'url': url,
                'title': article.title or 'Untitled',
                'content': text,
                'word_count': len(text.split()),
                'char_count': len(text),
            }
        except Exception as e:
            # Best-effort contract: report the failure in the result rather
            # than raising, so batch scraping can continue past bad URLs.
            return {
                'url': url,
                'title': '',
                'content': '',
                'error': str(e),
                'word_count': 0,
                'char_count': 0,
            }
class TextChunker:
    """Split text into overlapping, sentence-aligned chunks."""

    def __init__(self, chunk_size: int = 100, overlap: int = 20):
        """
        Initialize text chunker.

        Args:
            chunk_size: Maximum words per chunk.
            overlap: Desired overlap between consecutive chunks, in words;
                internally converted to roughly ``overlap // 10`` sentences.
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text: str, metadata: Dict = None) -> List[Dict]:
        """
        Split text into overlapping chunks on sentence boundaries.

        Args:
            text: Text to chunk.
            metadata: Optional extra keys merged into every chunk dict.

        Returns:
            List of chunk dictionaries (see _create_chunk_dict).
        """
        if not text.strip():
            return []

        # Sentence tokenization keeps chunk boundaries at natural breaks.
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_length = 0

        # Number of trailing sentences carried over into the next chunk.
        overlap_sents = self.overlap // 10

        for sentence in sentences:
            sentence_length = len(sentence.split())
            # Close the current chunk once adding this sentence would exceed
            # the word budget (never emit an empty chunk).
            if current_length + sentence_length > self.chunk_size and current_chunk:
                joined = ' '.join(current_chunk)
                chunks.append(self._create_chunk_dict(joined, metadata, len(chunks)))
                # Seed the next chunk with the trailing overlap sentences.
                # NOTE: when overlap_sents == 0 the slice [-0:] copies the
                # whole chunk — preserved from the original implementation.
                carried = current_chunk[-overlap_sents:] if len(current_chunk) >= overlap_sents else current_chunk
                current_chunk = carried + [sentence]
                current_length = sum(len(s.split()) for s in current_chunk)
            else:
                current_chunk.append(sentence)
                current_length += sentence_length

        # Flush whatever remains as the final chunk.
        if current_chunk:
            chunks.append(self._create_chunk_dict(' '.join(current_chunk), metadata, len(chunks)))
        return chunks

    def _create_chunk_dict(self, text: str, metadata: Dict, chunk_id: int) -> Dict:
        """Build one chunk record; metadata keys (if any) are merged in last."""
        chunk_dict = {
            'chunk_id': chunk_id,
            'text': text,
            'word_count': len(text.split()),
            'char_count': len(text),
        }
        if metadata:
            chunk_dict.update(metadata)
        return chunk_dict
# Example usage / smoke test: scrape one article and chunk it.
if __name__ == "__main__":
    scraper = WebScraper()
    chunker = TextChunker()

    # Test URL (replace with your target URL)
    test_url = "https://medium.com/@aminajavaid30/building-a-rag-system-the-data-ingestion-pipeline-d04235fd17ea"

    # Scrape content
    article_data = scraper.scrape_article(test_url)
    if article_data.get('error'):
        # Surface scrape failures instead of silently printing empty stats.
        print(f"Scrape failed: {article_data['error']}")
    print(f"Title: {article_data['title']}")
    print(f"Content length: {article_data['word_count']} words")

    # Create chunks
    if article_data['content']:
        chunks = chunker.chunk_text(
            article_data['content'],
            metadata={
                'url': article_data['url'],
                'title': article_data['title'],
            }
        )
        print(f"Created {len(chunks)} chunks")

        # Show first chunk
        if chunks:
            print(f"First chunk: {chunks[0]['text'][:200]}...")