import re
from typing import Any, Dict, List

import requests
from bs4 import BeautifulSoup

from src.config import config

class TextProcessor:
    """Fetch web pages, strip them to plain text, and split the text into
    overlapping word-based chunks suitable for indexing/embedding."""

    def __init__(self, chunk_size: int = None, chunk_overlap: int = None):
        """Configure chunking parameters.

        Args:
            chunk_size: Words per chunk; defaults to ``config.CHUNK_SIZE``.
            chunk_overlap: Words shared between consecutive chunks;
                defaults to ``config.CHUNK_OVERLAP``.

        Raises:
            ValueError: If ``chunk_size`` is not positive, or if
                ``chunk_overlap`` is negative or >= ``chunk_size`` (a
                non-positive step would make ``chunk_text`` loop forever).
        """
        # `is None` (not `or`) so an explicit 0 is rejected by the
        # validation below instead of silently replaced by the config value.
        self.chunk_size = chunk_size if chunk_size is not None else config.CHUNK_SIZE
        self.chunk_overlap = chunk_overlap if chunk_overlap is not None else config.CHUNK_OVERLAP
        if self.chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {self.chunk_size}")
        if not 0 <= self.chunk_overlap < self.chunk_size:
            raise ValueError(
                f"chunk_overlap must be in [0, chunk_size), got {self.chunk_overlap}"
            )

    def fetch_webpage(self, url: str, timeout: int = 30) -> str:
        """Download ``url`` and return the raw HTML body.

        Raises:
            requests.HTTPError: On non-2xx responses (via raise_for_status).
            requests.RequestException: On connection/timeout failures.
        """
        # A browser-like User-Agent avoids trivial bot blocking on some sites.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.text

    def clean_html(self, html: str) -> str:
        """Strip boilerplate tags from ``html`` and return whitespace-normalized text."""
        soup = BeautifulSoup(html, 'html.parser')

        # Remove non-content elements entirely (scripts, styling, chrome).
        for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
            element.decompose()

        text = soup.get_text()

        # Flatten layout whitespace: strip each line, split on double-space
        # runs, and rejoin the non-empty fragments with single spaces.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)

        # Collapse any remaining whitespace runs to a single space.
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def chunk_text(self, text: str) -> List[Dict[str, Any]]:
        """Split ``text`` into overlapping chunks of up to ``chunk_size`` words.

        Returns a list of dicts with keys ``id``, ``text``, ``start_word``,
        and ``end_word`` (exclusive word index). Empty input yields [].
        """
        if not text:
            return []

        words = text.split()
        # Guaranteed positive by __init__ validation, so the loop advances.
        step = self.chunk_size - self.chunk_overlap
        chunks = []

        i = 0
        chunk_id = 0
        while i < len(words):
            end = min(i + self.chunk_size, len(words))
            chunks.append({
                "id": chunk_id,
                "text": ' '.join(words[i:end]),
                "start_word": i,
                "end_word": end,
            })

            # Stop once the final word is covered; otherwise the overlap
            # step would emit trailing chunks that are subsets of this one.
            if end == len(words):
                break
            i += step
            chunk_id += 1

        return chunks

    def process_url(self, url: str) -> Dict:
        """Fetch ``url``, clean its HTML, chunk the text, and return a summary dict.

        Returns:
            Dict with ``url``, ``full_text``, ``chunks``, ``chunk_count``,
            and ``word_count``.

        Raises:
            requests.RequestException: Propagated from fetch_webpage.
        """
        html = self.fetch_webpage(url)

        clean_text = self.clean_html(html)

        chunks = self.chunk_text(clean_text)

        return {
            "url": url,
            "full_text": clean_text,
            "chunks": chunks,
            "chunk_count": len(chunks),
            "word_count": len(clean_text.split())
        }

# Module-level shared instance using the defaults from src.config.
# NOTE(review): constructed at import time, so importing this module
# requires config.CHUNK_SIZE / config.CHUNK_OVERLAP to be available.
text_processor = TextProcessor()