# Par-ity_Project / article_extractor.py
from newspaper import Article  # pip install newspaper3k
import pandas as pd
import time
import re
from typing import List, Optional


def extract_article_text(urls: List[str]) -> pd.DataFrame:
    """Extract text content from a list of article URLs."""
    articles = []
    for url in urls:
        try:
            article = Article(url)
            article.download()
            article.parse()
            articles.append({
                'url': url,
                'title': article.title,
                'text': article.text,
                'authors': article.authors,
                'publish_date': article.publish_date,
                'source': url.split('/')[2]  # Extract domain from the URL
            })
            time.sleep(1)  # Be respectful to servers
        except Exception as e:
            print(f"Failed to extract {url}: {e}")
    return pd.DataFrame(articles)
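

# Illustrative alternative, not used by the pipeline above: urlparse().hostname
# lowercases the host and strips ports/userinfo that url.split('/')[2] keeps.
# The helper name is hypothetical, not part of the original module.
def extract_domain(url: str) -> str:
    """Return the hostname of a URL, e.g. 'example.com' (falls back to the raw URL)."""
    from urllib.parse import urlparse
    return urlparse(url).hostname or url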


def clean_text(text: str) -> str:
    """Clean text by removing extra whitespace and special characters."""
    text = re.sub(r'\s+', ' ', text)          # collapse runs of whitespace
    text = re.sub(r'[^\w\s.,!?-]', '', text)  # keep word chars and basic punctuation
    return text.strip()
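
# For example (illustrative): clean_text("Hello,\n  world! ©2024") returns
# "Hello, world! 2024": the whitespace run collapses to one space and the
# copyright sign is dropped by the second regex.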


def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """Split text into overlapping chunks of roughly chunk_size words."""
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    chunks = []
    # Step by (chunk_size - overlap) so consecutive chunks share `overlap` words
    for i in range(0, len(words), chunk_size - overlap):
        chunks.append(' '.join(words[i:i + chunk_size]))
    return chunks
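
# Illustrative walk-through: with the defaults (chunk_size=1000, overlap=200)
# the loop advances 800 words per step, so chunk k covers words
# [800*k, 800*k + 1000) and shares its last 200 words with chunk k + 1.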


def process_articles(urls: List[str], save_path: Optional[str] = None) -> pd.DataFrame:
    """Complete pipeline to extract, clean, and chunk articles."""
    print(f"Extracting text from {len(urls)} articles...")
    # Extract articles
    df = extract_article_text(urls)
    if df.empty:
        print("No articles could be extracted.")
        return df
    # Clean text
    df['cleaned_text'] = df['text'].apply(clean_text)
    # Create overlapping chunks for each article
    df['text_chunks'] = df['cleaned_text'].apply(
        lambda x: chunk_text(x) if pd.notna(x) else []
    )
    # Save if a path was provided
    if save_path:
        df.to_csv(save_path, index=False)
        print(f"Results saved to {save_path}")
    return df
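
# Note (assumption about downstream use): to_csv stores the 'text_chunks' list
# column as its string repr, so pd.read_csv returns strings; recover the lists
# with ast.literal_eval, or prefer to_json/to_parquet for round-tripping.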


if __name__ == "__main__":
    # Example usage
    sample_urls = [
        "https://example.com/article1",
        "https://example.com/article2"
    ]
    # Process articles
    # df = process_articles(sample_urls, "extracted_articles.csv")
    # print(f"Extracted {len(df)} articles")
    print("Article extractor ready! Use process_articles() with your URLs.")