Spaces:

Al1Abdullah
/

Ali_Chatbot

Sleeping

File size: 4,476 Bytes

import requests
from bs4 import BeautifulSoup
import re
from typing import List, Dict
from urllib.parse import urljoin, urlparse
import time
import nltk
from nltk.tokenize import sent_tokenize

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

from newspaper import Article

class WebScraper:
    def __init__(self, delay: float = 1.0):
        self.delay = delay

    def scrape_article(self, url: str) -> Dict[str, str]:
        try:
            article = Article(url)
            article.download()
            article.parse()

            return {
                'url': url,
                'title': article.title or 'Untitled',
                'content': article.text,
                'word_count': len(article.text.split()),
                'char_count': len(article.text)
            }

        except Exception as e:
            return {
                'url': url,
                'title': '',
                'content': '',
                'error': str(e),
                'word_count': 0,
                'char_count': 0
            }

class TextChunker:
    def __init__(self, chunk_size: int = 100, overlap: int = 20):
        """
        Initialize text chunker
        Args:
            chunk_size: Maximum tokens per chunk
            overlap: Overlap between chunks
        """
        self.chunk_size = chunk_size
        self.overlap = overlap
    
    def chunk_text(self, text: str, metadata: Dict = None) -> List[Dict]:
        """
        Split text into overlapping chunks
        Args:
            text: Text to chunk
            metadata: Additional metadata to include
        Returns:
            List of chunk dictionaries
        """
        if not text.strip():
            return []
        
        # Use sentence tokenization for better chunk boundaries
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_length = 0
        
        for sentence in sentences:
            sentence_length = len(sentence.split())
            
            # If adding this sentence would exceed chunk size, create a new chunk
            if current_length + sentence_length > self.chunk_size and current_chunk:
                chunk_text = ' '.join(current_chunk)
                print(f"📄 Chunk {len(chunks)}:\n{text[:150]}...\n")
                chunks.append(self._create_chunk_dict(chunk_text, metadata, len(chunks)))
                
                # Start new chunk with overlap
                overlap_sentences = current_chunk[-self.overlap//10:] if len(current_chunk) >= self.overlap//10 else current_chunk
                current_chunk = overlap_sentences + [sentence]
                current_length = sum(len(s.split()) for s in current_chunk)
            else:
                current_chunk.append(sentence)
                current_length += sentence_length
        
        # Add the last chunk
        if current_chunk:
            chunk_text = ' '.join(current_chunk)
            chunks.append(self._create_chunk_dict(chunk_text, metadata, len(chunks)))
        
        return chunks
    
    def _create_chunk_dict(self, text: str, metadata: Dict, chunk_id: int) -> Dict:
        """Create a chunk dictionary with metadata"""
        chunk_dict = {
            'chunk_id': chunk_id,
            'text': text,
            'word_count': len(text.split()),
            'char_count': len(text)
        }
        
        if metadata:
            chunk_dict.update(metadata)
        
        return chunk_dict

# Example usage
if __name__ == "__main__":
    # Test the scraper
    scraper = WebScraper()
    chunker = TextChunker()
    
    # Test URL (replace with your target URL)
    test_url = "https://medium.com/@aminajavaid30/building-a-rag-system-the-data-ingestion-pipeline-d04235fd17ea"
    
    # Scrape content
    article_data = scraper.scrape_article(test_url)
    print(f"Title: {article_data['title']}")
    print(f"Content length: {article_data['word_count']} words")
    
    # Create chunks
    if article_data['content']:
        chunks = chunker.chunk_text(
            article_data['content'], 
            metadata={
                'url': article_data['url'],
                'title': article_data['title']
            }
        )
        print(f"Created {len(chunks)} chunks")
        
        # Show first chunk
        if chunks:
            print(f"First chunk: {chunks[0]['text'][:200]}...")