File size: 2,506 Bytes
959d1ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import requests
from newspaper import Article
import pandas as pd
import time
from pathlib import Path
import re
from typing import List

def extract_article_text(urls: List[str], delay: float = 1.0) -> pd.DataFrame:
    """Download and parse each URL into one row of article data.

    Args:
        urls: Article URLs to fetch, processed in order.
        delay: Seconds to pause after each successfully extracted article so
            the target servers are not hit back-to-back. Values <= 0 disable
            the pause.

    Returns:
        A DataFrame with the columns ``url``, ``title``, ``text``,
        ``authors``, ``publish_date`` and ``source``; one row per URL that
        was extracted successfully. URLs that fail are reported on stdout
        and skipped. The columns are present even when no row was produced,
        so callers can index them unconditionally.
    """
    columns = ['url', 'title', 'text', 'authors', 'publish_date', 'source']
    articles = []
    for url in urls:
        try:
            article = Article(url)
            article.download()
            article.parse()

            record = {
                'url': url,
                'title': article.title,
                'text': article.text,
                'authors': article.authors,
                'publish_date': article.publish_date,
                'source': url.split('/')[2]  # Extract domain
            }
        except Exception as e:
            # Best-effort scraping: one bad URL must not abort the batch.
            print(f"Failed to extract {url}: {e}")
            continue

        articles.append(record)
        if delay > 0:
            time.sleep(delay)  # Be respectful to servers

    # Passing `columns` keeps the schema stable when `articles` is empty.
    return pd.DataFrame(articles, columns=columns)

def clean_text(text: str) -> str:
    """Strip special characters and normalize whitespace.

    Keeps word characters, whitespace and the punctuation ``. , ! ? -``;
    every other character is removed. Runs of whitespace (including newlines
    and tabs) are then collapsed to a single space and the result is trimmed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text, containing no leading, trailing or repeated
        whitespace.
    """
    # Remove disallowed characters first: deleting a symbol that sits between
    # two spaces would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Chunking
    stops as soon as a chunk reaches the end of the text, so the final chunk
    is never a mere subset of the one before it.

    Args:
        text: Text to split; words are delimited by any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must be
            non-negative and smaller than ``chunk_size``.

    Returns:
        The chunks in order, each joined with single spaces. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # This chunk already covers the tail; another one would only repeat
        # words that are fully contained in it.
        if start + chunk_size >= len(words):
            break

    return chunks

def process_articles(urls: List[str], save_path: str = None) -> pd.DataFrame:
    """Run the full pipeline: extract, clean and chunk a list of articles.

    Args:
        urls: Article URLs to process.
        save_path: Optional CSV destination. When given (and non-empty), the
            resulting frame is written there without the index. Note that
            the list-valued ``text_chunks`` column is serialized by pandas as
            text, not as a structured list.

    Returns:
        The DataFrame produced by ``extract_article_text`` with two added
        columns: ``cleaned_text`` (output of ``clean_text``) and
        ``text_chunks`` (output of ``chunk_text`` on the cleaned text).
        If no article could be extracted the frame is empty but still
        carries the ``text``, ``cleaned_text`` and ``text_chunks`` columns.
    """
    print(f"Extracting text from {len(urls)} articles...")

    # Extract articles
    df = extract_article_text(urls)

    # An extraction that yields no rows may come back without any columns;
    # add an empty 'text' column so the steps below do not raise KeyError.
    if 'text' not in df.columns:
        df['text'] = pd.Series(dtype='object')

    # Clean text. Missing or non-string values are mapped to '' here, before
    # clean_text runs, because its regex calls would raise on them.
    df['cleaned_text'] = df['text'].apply(
        lambda raw: clean_text(raw) if isinstance(raw, str) else ''
    )

    # Create chunks for each article (empty text produces an empty list)
    df['text_chunks'] = df['cleaned_text'].apply(chunk_text)

    # Save if path provided
    if save_path:
        df.to_csv(save_path, index=False)
        print(f"Results saved to {save_path}")

    return df

if __name__ == "__main__":
    # Placeholder inputs for a manual run; replace with real article URLs.
    sample_urls = [
        "https://example.com/article1",
        "https://example.com/article2",
    ]

    # The pipeline call is left disabled because the URLs above are dummies.
    # To try it out, uncomment:
    # df = process_articles(sample_urls, "extracted_articles.csv")
    # print(f"Extracted {len(df)} articles")

    ready_message = "Article extractor ready! Use process_articles() with your URLs."
    print(ready_message)