File size: 2,825 Bytes
20e57f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import requests
from bs4 import BeautifulSoup
import time

def read_file(file_path):
    """Read and return the stripped text content of a file.

    Tries UTF-8 first, then falls back to latin-1 (which accepts any
    byte sequence) if the file is not valid UTF-8.

    Args:
        file_path (str): Path to the text file.

    Returns:
        str: File content with leading/trailing whitespace stripped.

    Raises:
        Exception: If the file cannot be read, or if its (stripped)
            content is empty.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
    except UnicodeDecodeError:
        # UTF-8 decoding failed; retry with latin-1, which maps every
        # possible byte, so this read can only fail on I/O errors.
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                content = f.read().strip()
        except Exception as e:
            raise Exception(f"Failed to read file with alternative encoding: {str(e)}") from e
    except Exception as e:
        raise Exception(f"File reading failed: {str(e)}") from e

    # Checked outside the try blocks so the "File is empty" message is
    # not re-wrapped into a misleading "File reading failed: ..." error.
    if not content:
        raise Exception("File is empty")
    return content

def extract_from_url(url):
    """Fetch a URL and extract its readable paragraph text.

    Prefers text found inside ``<article>``/``<main>`` containers and
    falls back to all ``<p>`` tags on the page. The HTTP GET is retried
    up to three times on transient network errors.

    Args:
        url (str): URL to extract text from.

    Returns:
        str: Extracted text content (space-joined paragraph text).

    Raises:
        Exception: If the URL cannot be fetched after retries, or the
            page contains no extractable text.
    """
    try:
        # Browser-like User-Agent: some sites reject default client UAs.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # Retry transient request failures; re-raise on the last attempt.
        max_retries = 3
        response = None
        for attempt in range(max_retries):
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                break
            except requests.RequestException:
                if attempt == max_retries - 1:
                    raise
                time.sleep(1)

        soup = BeautifulSoup(response.text, 'html.parser')

        # Prefer article/main containers. Join each container's paragraph
        # text, then join the containers with a space — the original
        # concatenation fused the last word of one article with the first
        # word of the next when several containers were present.
        parts = []
        for container in soup.find_all(['article', 'main']):
            paragraphs = container.find_all("p")
            text = " ".join(p.text.strip() for p in paragraphs if p.text.strip())
            if text:
                parts.append(text)
        article_text = " ".join(parts)

        # Fall back to every paragraph on the page.
        if not article_text:
            paragraphs = soup.find_all("p")
            article_text = " ".join(p.text.strip() for p in paragraphs if p.text.strip())

        if not article_text:
            raise Exception("No text content found on the page")

        return article_text
    except requests.RequestException as e:
        raise Exception(f"Failed to fetch URL: {str(e)}") from e
    except Exception as e:
        raise Exception(f"URL extraction failed: {str(e)}") from e