Spaces:
Sleeping
Sleeping
Nikhil Pravin Pise committed on
Commit ·
a80eeb8
1
Parent(s): 81035fa
Initial deploy
Browse files- .env +4 -0
- Dockerfile +33 -0
- README.md +8 -7
- app.py +149 -0
- requirements.txt +13 -0
- server.py +386 -0
- src/__init__.py +0 -0
- src/__main__.py +6 -0
- src/__pycache__/__init__.cpython-313.pyc +0 -0
- src/__pycache__/__main__.cpython-313.pyc +0 -0
- src/__pycache__/config.cpython-313.pyc +0 -0
- src/__pycache__/main.cpython-313.pyc +0 -0
- src/__pycache__/parser.cpython-313.pyc +0 -0
- src/__pycache__/service.cpython-313.pyc +0 -0
- src/__pycache__/state.cpython-313.pyc +0 -0
- src/__pycache__/utils.cpython-313.pyc +0 -0
- src/config.py +86 -0
- src/main.py +180 -0
- src/parser.py +309 -0
- src/py.typed +0 -0
- src/service.py +279 -0
- src/state.py +69 -0
- src/utils.py +10 -0
.env
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Add your API keys in Hugging Face Space Settings (Secrets)
|
| 2 |
+
# GEMINI_API_KEY=
|
| 3 |
+
# OPENAI_API_KEY=
|
| 4 |
+
# ELEVENLABS_API_KEY=
|
Dockerfile
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
# Set working directory
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# Install system dependencies
|
| 7 |
+
RUN apt-get update && apt-get install -y \
|
| 8 |
+
git \
|
| 9 |
+
wget \
|
| 10 |
+
gnupg \
|
| 11 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 12 |
+
|
| 13 |
+
# Copy requirements first to leverage cache
|
| 14 |
+
COPY requirements.txt .
|
| 15 |
+
|
| 16 |
+
# Install Python dependencies
|
| 17 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 18 |
+
|
| 19 |
+
# Install Playwright and browsers
|
| 20 |
+
RUN playwright install --with-deps chromium
|
| 21 |
+
|
| 22 |
+
# Copy the rest of the application
|
| 23 |
+
COPY . .
|
| 24 |
+
|
| 25 |
+
# Set environment variables
|
| 26 |
+
ENV PYTHONPATH=/app
|
| 27 |
+
ENV PYTHONUNBUFFERED=1
|
| 28 |
+
|
| 29 |
+
# Expose port 7860 for Gradio
|
| 30 |
+
EXPOSE 7860
|
| 31 |
+
|
| 32 |
+
# Run the application
|
| 33 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,12 +1,13 @@
|
|
| 1 |
---
|
| 2 |
-
title: Medium
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
-
|
| 9 |
-
short_description: A MCP Server with a Scraper built in
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Medium Agent
|
| 3 |
+
emoji: 📝
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: black
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
app_port: 7860
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# Medium Agent
|
| 12 |
+
|
| 13 |
+
A powerful Medium article scraper and audio generator.
|
app.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import asyncio
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
import ast
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
# Load environment variables
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
# Import tools from server
|
| 12 |
+
# We assume server.py is in the same directory
|
| 13 |
+
try:
|
| 14 |
+
from server import medium_search, medium_cast, medium_synthesize
|
| 15 |
+
except ImportError:
|
| 16 |
+
# If running locally with different structure, try to adjust path
|
| 17 |
+
sys.path.append(os.path.dirname(__file__))
|
| 18 |
+
from server import medium_search, medium_cast, medium_synthesize
|
| 19 |
+
|
| 20 |
+
async def search_wrapper(query):
    """Search Medium via the MCP tool and render the results as an HTML card grid.

    Args:
        query: Free-text search string entered in the UI.

    Returns:
        An HTML string of result cards, or a plain status/error message.
    """
    if not query:
        return "Please enter a query."

    gr.Info(f"Searching for '{query}'...")
    # FIX: initialize before the try-block. The original referenced result_str
    # in the except handler, which raised NameError (masking the real error)
    # whenever medium_search itself failed before the assignment.
    result_str = ""
    try:
        # Get string result from tool (it returns str(list_of_dicts)).
        result_str = await medium_search(query)

        # Parse string back to a list; literal_eval only accepts Python literals.
        results = ast.literal_eval(result_str)

        if not results:
            return "No results found."

        html = "<div style='display: grid; grid-template-columns: repeat(auto-fill, minmax(300px, 1fr)); gap: 20px;'>"
        for art in results:
            title = art.get('title', 'No Title')
            url = art.get('url', '#')
            author = art.get('author', {}).get('name', 'Unknown') if art.get('author') else 'Unknown'
            publication = art.get('publication', '')
            if publication and author == 'Unknown':
                author = publication

            img = art.get('imageUrl', '')

            # Fallback image if empty
            if not img:
                img = "https://miro.medium.com/max/1400/1*jfdwtvU6V6g99q3G7gq7dQ.png"

            html += f"""
            <div style='border: 1px solid #ddd; border-radius: 8px; overflow: hidden; padding: 0; background: #2b2b2b; color: #fff; display: flex; flex-direction: column;'>
                <div style='height: 160px; background-image: url("{img}"); background-size: cover; background-position: center;'></div>
                <div style='padding: 15px; flex-grow: 1;'>
                    <h3 style='margin: 0 0 10px 0; font-size: 16px; line-height: 1.4;'><a href='{url}' target='_blank' style='color: #fff; text-decoration: none;'>{title}</a></h3>
                    <p style='margin: 0; font-size: 12px; color: #aaa;'>By {author}</p>
                </div>
            </div>
            """
        html += "</div>"
        return html
    except Exception as e:
        return f"Error parsing results: {e}. Raw output: {result_str}"
|
| 63 |
+
|
| 64 |
+
async def audio_wrapper(url, voice_id):
    """Generate audio for a Medium article URL via the medium_cast tool.

    Args:
        url: Article URL entered in the UI.
        voice_id: ElevenLabs voice ID (used only by the ElevenLabs fallback).

    Returns:
        A (status_message, audio_filepath_or_None) tuple for the Gradio outputs.
    """
    if not url:
        return "Please enter a URL.", None

    gr.Info("Generating Audio... This may take a minute.")

    # Note: medium_cast uses Edge-TTS by default (free), so we don't strictly need API keys
    # unless falling back to ElevenLabs/OpenAI

    try:
        result = await medium_cast(url, voice_id)

        # Success messages embed the output path after the final ": ".
        if "Audio generated successfully" in result:
            try:
                # FIX: rsplit on the last ": " — plain split(": ")[1] returned the
                # wrong segment if the message contained more than one ": ".
                path = result.rsplit(": ", 1)[-1].strip()
                # Remove any markdown formatting if present
                path = path.replace("`", "")
                if os.path.exists(path):
                    return result, path
                return f"{result} (File not found at {path})", None
            except Exception:
                # FIX: was a bare `except:` — that also swallowed SystemExit
                # and KeyboardInterrupt. Keep best-effort behavior otherwise.
                return result, None
        return result, None
    except Exception as e:
        return f"Error: {str(e)}", None
|
| 92 |
+
|
| 93 |
+
async def synthesize_wrapper(topic):
    """Run the medium_synthesize tool for *topic* and return the report text.

    Requires at least one LLM API key (Gemini or OpenAI) to be configured;
    otherwise returns a warning without starting the expensive scrape.
    """
    if not topic:
        return "Please enter a topic."

    # Check for an LLM key before kicking off a multi-minute scrape.
    has_llm_key = bool(os.environ.get("GEMINI_API_KEY")) or bool(os.environ.get("OPENAI_API_KEY"))
    if not has_llm_key:
        return "⚠️ Warning: No GEMINI_API_KEY or OPENAI_API_KEY found. Synthesis might fail or return mock data."

    gr.Info(f"Synthesizing report for '{topic}'... This involves scraping multiple articles and may take 2-3 minutes.")
    try:
        report = await medium_synthesize(topic)
    except Exception as e:
        return f"Error during synthesis: {str(e)}"
    return report
|
| 106 |
+
|
| 107 |
+
# Build UI
|
| 108 |
+
with gr.Blocks(title="Medium Agent", theme=gr.themes.Soft()) as demo:
|
| 109 |
+
gr.Markdown("# 📝 Medium Agent")
|
| 110 |
+
gr.Markdown("Search, Read, and Listen to Medium articles. Powered by MCP and Playwright.")
|
| 111 |
+
|
| 112 |
+
with gr.Tab("🔍 Search"):
|
| 113 |
+
gr.Markdown("### Search Medium Articles")
|
| 114 |
+
with gr.Row():
|
| 115 |
+
search_input = gr.Textbox(label="Query", placeholder="e.g. AI Agents", scale=4)
|
| 116 |
+
search_btn = gr.Button("Search", variant="primary", scale=1)
|
| 117 |
+
search_output = gr.HTML(label="Results")
|
| 118 |
+
search_btn.click(search_wrapper, inputs=search_input, outputs=search_output)
|
| 119 |
+
search_input.submit(search_wrapper, inputs=search_input, outputs=search_output)
|
| 120 |
+
|
| 121 |
+
with gr.Tab("🎧 Audio Article"):
|
| 122 |
+
gr.Markdown("### Convert Article to Audio")
|
| 123 |
+
gr.Markdown("Uses Edge-TTS (Free) by default. Falls back to ElevenLabs/OpenAI if configured.")
|
| 124 |
+
with gr.Row():
|
| 125 |
+
url_input = gr.Textbox(label="Article URL", placeholder="https://medium.com/...", scale=4)
|
| 126 |
+
audio_btn = gr.Button("Generate Audio", variant="primary", scale=1)
|
| 127 |
+
|
| 128 |
+
with gr.Accordion("Advanced Options", open=False):
|
| 129 |
+
voice_input = gr.Textbox(label="Voice ID (for ElevenLabs)", value="JBFqnCBsd6RMkjVDRZzb")
|
| 130 |
+
|
| 131 |
+
# Output status and audio player
|
| 132 |
+
audio_status = gr.Textbox(label="Status", interactive=False)
|
| 133 |
+
audio_player = gr.Audio(label="Play Audio", type="filepath")
|
| 134 |
+
|
| 135 |
+
audio_btn.click(audio_wrapper, inputs=[url_input, voice_input], outputs=[audio_status, audio_player])
|
| 136 |
+
|
| 137 |
+
with gr.Tab("🧠 Smart Synthesis"):
|
| 138 |
+
gr.Markdown("### Generate 'State of the Union' Report")
|
| 139 |
+
gr.Markdown("Scrapes top articles on a topic and uses Gemini/OpenAI to generate a comprehensive report.")
|
| 140 |
+
with gr.Row():
|
| 141 |
+
topic_input = gr.Textbox(label="Topic", placeholder="e.g. Generative AI", scale=4)
|
| 142 |
+
synth_btn = gr.Button("Synthesize", variant="primary", scale=1)
|
| 143 |
+
synth_output = gr.Markdown(label="Report")
|
| 144 |
+
synth_btn.click(synthesize_wrapper, inputs=topic_input, outputs=synth_output)
|
| 145 |
+
topic_input.submit(synthesize_wrapper, inputs=topic_input, outputs=synth_output)
|
| 146 |
+
|
| 147 |
+
if __name__ == "__main__":
|
| 148 |
+
# Launch with 0.0.0.0 for Docker/Cloud support
|
| 149 |
+
demo.launch(server_name="0.0.0.0", server_port=7860)
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
playwright>=1.40.0
|
| 2 |
+
beautifulsoup4>=4.12.0
|
| 3 |
+
markdownify>=0.11.6
|
| 4 |
+
httpx>=0.25.0
|
| 5 |
+
aiofiles>=23.2.1
|
| 6 |
+
google-generativeai>=0.3.0
|
| 7 |
+
openai>=1.3.0
|
| 8 |
+
edge-tts>=6.1.0
|
| 9 |
+
elevenlabs>=0.2.0
|
| 10 |
+
mcp>=0.9.0
|
| 11 |
+
fastmcp>=0.2.0
|
| 12 |
+
python-dotenv>=1.0.0
|
| 13 |
+
gradio>=4.0.0
|
server.py
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import asyncio
|
| 4 |
+
import httpx
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
from elevenlabs.client import ElevenLabs
|
| 7 |
+
from openai import AsyncOpenAI
|
| 8 |
+
import google.generativeai as genai
|
| 9 |
+
import edge_tts
|
| 10 |
+
|
| 11 |
+
# Add sibling 'Medium-Scraper' directory to sys.path to access 'src'
|
| 12 |
+
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../Medium-Scraper"))
|
| 13 |
+
if project_root not in sys.path:
|
| 14 |
+
sys.path.insert(0, project_root)
|
| 15 |
+
|
| 16 |
+
from mcp.server.fastmcp import FastMCP, Context, Image
|
| 17 |
+
from src.service import ScraperService
|
| 18 |
+
|
| 19 |
+
# Initialize FastMCP
|
| 20 |
+
mcp = FastMCP("Medium Scraper")
|
| 21 |
+
|
| 22 |
+
# Initialize Scraper Service (Worker Pool)
|
| 23 |
+
scraper = ScraperService(max_workers=5)
|
| 24 |
+
|
| 25 |
+
# --- Resources ---
|
| 26 |
+
|
| 27 |
+
@mcp.resource("medium://trending")
|
| 28 |
+
async def get_trending(ctx: Context = None) -> str:
|
| 29 |
+
"""Returns the top trending articles on Medium."""
|
| 30 |
+
# We use the 'trending' tag as a proxy
|
| 31 |
+
if ctx:
|
| 32 |
+
await ctx.info("Fetching trending articles...")
|
| 33 |
+
|
| 34 |
+
results = await scraper.scrape_tag("trending", max_articles=10, progress_callback=ctx.info if ctx else None)
|
| 35 |
+
return str(results)
|
| 36 |
+
|
| 37 |
+
@mcp.resource("medium://tag/{tag}")
|
| 38 |
+
async def get_tag_feed(tag: str, ctx: Context = None) -> str:
|
| 39 |
+
"""Returns the latest articles for a specific tag."""
|
| 40 |
+
if ctx:
|
| 41 |
+
await ctx.info(f"Fetching articles for tag: {tag}...")
|
| 42 |
+
|
| 43 |
+
results = await scraper.scrape_tag(tag, max_articles=10, progress_callback=ctx.info if ctx else None)
|
| 44 |
+
return str(results)
|
| 45 |
+
|
| 46 |
+
# --- Tools ---
|
| 47 |
+
|
| 48 |
+
@mcp.tool()
|
| 49 |
+
async def medium_search(query: str, ctx: Context = None) -> str:
|
| 50 |
+
"""
|
| 51 |
+
Search Medium for articles.
|
| 52 |
+
Args:
|
| 53 |
+
query: The search query (e.g. "AI Agents", "Python Asyncio")
|
| 54 |
+
"""
|
| 55 |
+
if ctx:
|
| 56 |
+
await ctx.info(f"Searching for: {query}...")
|
| 57 |
+
|
| 58 |
+
results = await scraper.scrape_search(query, progress_callback=ctx.info if ctx else None)
|
| 59 |
+
return str(results)
|
| 60 |
+
|
| 61 |
+
@mcp.tool()
|
| 62 |
+
async def medium_fresh(tag: str, ctx: Context = None) -> str:
|
| 63 |
+
"""
|
| 64 |
+
Get the latest articles for a specific tag (Freshness).
|
| 65 |
+
Args:
|
| 66 |
+
tag: The topic tag (e.g. "Artificial Intelligence")
|
| 67 |
+
"""
|
| 68 |
+
if ctx:
|
| 69 |
+
await ctx.info(f"Fetching fresh articles for: {tag}...")
|
| 70 |
+
|
| 71 |
+
results = await scraper.scrape_tag(tag, progress_callback=ctx.info if ctx else None)
|
| 72 |
+
return str(results)
|
| 73 |
+
|
| 74 |
+
@mcp.tool()
async def get_thumbnail(image_url: str, ctx: Context = None) -> Image:
    """
    Fetch an image from a URL and return it as an MCP Image object.
    Args:
        image_url: The URL of the image to fetch.
    """
    if ctx:
        await ctx.info(f"Fetching image: {image_url}...")
    # Download the raw bytes and hand them to the MCP client wrapper.
    async with httpx.AsyncClient() as client:
        resp = await client.get(image_url)
        resp.raise_for_status()
        payload = resp.content
    # NOTE(review): format is hard-coded to "png" regardless of the actual
    # Content-Type — presumably acceptable for Medium thumbnails; verify.
    return Image(data=payload, format="png")
|
| 87 |
+
|
| 88 |
+
@mcp.tool()
async def medium_cast(url: str, voice_id: str = "JBFqnCBsd6RMkjVDRZzb", ctx: Context = None) -> str:
    """
    Convert a Medium article into audio.

    Tries Edge-TTS first (free, no API key needed), then falls back to
    ElevenLabs (if ELEVENLABS_API_KEY is set), then OpenAI TTS
    (if OPENAI_API_KEY is set).

    Args:
        url: The URL of the article.
        voice_id: The ElevenLabs voice ID to use (default: 'JBFqnCBsd6RMkjVDRZzb' - George).

    Returns:
        A status string; on success it ends with ": <path-to-mp3>".
    """
    # FIX: the original returned "Error: ELEVENLABS_API_KEY not set." up front,
    # which blocked the free Edge-TTS path entirely (app.py explicitly relies on
    # Edge-TTS working without keys). The key is now only required when the
    # ElevenLabs fallback is actually attempted below.
    if ctx:
        await ctx.info(f"Scraping article for audio: {url}...")

    try:
        article = await scraper.scrape_article(url)
        if not article:
            return "Error: Failed to scrape article (returned None)."

        text = article.get("markdownContent", "")
        title = article.get("title") or "Article"

        author_data = article.get("author")
        author = author_data.get("name") if (author_data and isinstance(author_data, dict)) else None
        publication = article.get("publication")  # publication is tracked separately

        if ctx:
            await ctx.info("Scraping complete. Processing text...")

        # Handle missing/blocked content by narrating a fallback script.
        if not text or "Could not extract" in text or "Verify you are human" in text:
            # If text starts with "Summary:", we only got the meta description.
            description = ""
            if text and text.startswith("Summary:"):
                description = text.replace("Summary:", "").strip()

            # Build attribution line
            attribution = f"Title: {title}."
            if author:
                attribution += f" By {author}."
            elif publication:
                attribution += f" Published by {publication}."

            text = attribution + " "

            if description:
                text += f"Here is a summary: {description}. "
                text += "I could not retrieve the full text due to access restrictions, but I hope this summary is helpful."
            else:
                text += "I could not retrieve the full text of this article due to access restrictions, but I encourage you to read it on Medium."

        # Final validation
        if not text or len(text.strip()) < 10:
            return "Error: No text available to generate audio."

        # Truncate for TTS (save cost/time)
        if len(text) > 2500:
            text = text[:2500] + "... (end of preview)"

        import uuid
        output_filename = f"output_{uuid.uuid4().hex}.mp3"
        output_path = os.path.join(os.path.dirname(__file__), output_filename)

        if ctx:
            await ctx.info(f"Text prepared ({len(text)} chars). Starting audio generation...")

        # 1. Try Edge-TTS (Free, High Quality)
        try:
            if ctx:
                await ctx.info("Generating audio with Edge-TTS (Free)...")

            # Voice: en-US-ChristopherNeural (Male) or en-US-AriaNeural (Female)
            communicate = edge_tts.Communicate(text, "en-US-ChristopherNeural")
            await communicate.save(output_path)

            # Verify the file was actually written before declaring success.
            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                return f"Audio generated successfully (via Edge-TTS): {output_path}"
            raise Exception("Edge-TTS generated empty file.")
        except Exception as e:
            if ctx:
                await ctx.info(f"Edge-TTS failed: {e}. Falling back to ElevenLabs...")

        # 2. Fallback: ElevenLabs (only if a key is configured)
        api_key = os.environ.get("ELEVENLABS_API_KEY")
        if api_key:
            try:
                if ctx:
                    await ctx.info("Generating audio with ElevenLabs...")

                # Run blocking SDK I/O in a worker thread to keep the loop free.
                def _run_elevenlabs():
                    client = ElevenLabs(api_key=api_key)
                    audio_generator = client.text_to_speech.convert(
                        text=text,
                        voice_id=voice_id,
                        model_id="eleven_multilingual_v2",
                        output_format="mp3_44100_128",
                    )
                    with open(output_path, "wb") as f:
                        for chunk in audio_generator:
                            f.write(chunk)

                await asyncio.to_thread(_run_elevenlabs)

                if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                    return f"Audio generated successfully: {output_path}"
            except Exception as e:
                if "quota_exceeded" in str(e):
                    if ctx:
                        await ctx.info("ElevenLabs quota exceeded. Falling back to OpenAI TTS...")
                else:
                    if ctx:
                        await ctx.info(f"ElevenLabs failed: {e}. Falling back to OpenAI TTS...")

        # 3. Fallback: OpenAI
        openai_key = os.environ.get("OPENAI_API_KEY")
        if not openai_key:
            return "Error: Edge-TTS failed, ElevenLabs failed/missing, and OPENAI_API_KEY not set."

        try:
            if ctx:
                await ctx.info("Generating audio with OpenAI TTS...")

            client = AsyncOpenAI(api_key=openai_key)
            response = await client.audio.speech.create(
                model="tts-1",
                voice="alloy",
                input=text[:4096]  # OpenAI limit
            )

            response.stream_to_file(output_path)
            return f"Audio generated successfully (via OpenAI Fallback): {output_path}"
        except Exception as e2:
            if "insufficient_quota" in str(e2) or "429" in str(e2):
                return "Error: All TTS services (Edge-TTS, ElevenLabs, OpenAI) failed or quotas exceeded."
            return f"Error generating audio (All fallbacks failed): {str(e2)}"

    except Exception as e:
        return f"Error generating audio: {str(e)}"
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
@mcp.tool()
|
| 237 |
+
async def medium_synthesize(topic: str, ctx: Context = None) -> str:
|
| 238 |
+
"""
|
| 239 |
+
Synthesize a 'State of the Union' report on a topic using top Medium articles.
|
| 240 |
+
Args:
|
| 241 |
+
topic: The topic to analyze (e.g. "Generative AI").
|
| 242 |
+
"""
|
| 243 |
+
api_key = os.environ.get("OPENAI_API_KEY")
|
| 244 |
+
# We check for OpenAI key as a baseline, but Gemini key is checked inside
|
| 245 |
+
if not api_key and not os.environ.get("GEMINI_API_KEY"):
|
| 246 |
+
return "Error: Neither OPENAI_API_KEY nor GEMINI_API_KEY is set."
|
| 247 |
+
|
| 248 |
+
if ctx:
|
| 249 |
+
await ctx.info(f"Scraping top articles for: {topic}...")
|
| 250 |
+
|
| 251 |
+
# 1. Scrape Top Articles
|
| 252 |
+
articles = await scraper.scrape_search(topic, max_articles=5, progress_callback=ctx.info if ctx else None)
|
| 253 |
+
|
| 254 |
+
if not articles:
|
| 255 |
+
return "No articles found to synthesize."
|
| 256 |
+
|
| 257 |
+
# 2. Prepare Context for LLM (Parallel Scraping)
|
| 258 |
+
if ctx:
|
| 259 |
+
await ctx.info(f"Deep scraping {len(articles)} articles in parallel...")
|
| 260 |
+
|
| 261 |
+
async def _scrape_single_article(art):
|
| 262 |
+
url = art.get('url')
|
| 263 |
+
title = art.get('title')
|
| 264 |
+
author = art.get('author', {}).get('name')
|
| 265 |
+
|
| 266 |
+
content = ""
|
| 267 |
+
try:
|
| 268 |
+
full_art = await scraper.scrape_article(url)
|
| 269 |
+
content = full_art.get("markdownContent", "")
|
| 270 |
+
if "Could not extract" in content:
|
| 271 |
+
content = ""
|
| 272 |
+
except Exception:
|
| 273 |
+
content = ""
|
| 274 |
+
|
| 275 |
+
if not content:
|
| 276 |
+
content = f"Summary: {title} by {author}. (Full content unavailable)."
|
| 277 |
+
|
| 278 |
+
# Truncate
|
| 279 |
+
content = content[:2000] + "..." if len(content) > 2000 else content
|
| 280 |
+
return f"\nTitle: {title}\nURL: {url}\nAuthor: {author}\nContent:\n{content}\n"
|
| 281 |
+
|
| 282 |
+
# Run all scrapes concurrently
|
| 283 |
+
results = await asyncio.gather(*[_scrape_single_article(art) for art in articles])
|
| 284 |
+
context_text = "".join(results)
|
| 285 |
+
|
| 286 |
+
if ctx:
|
| 287 |
+
await ctx.info("Synthesizing insights...")
|
| 288 |
+
|
| 289 |
+
# 3. Call LLM (Gemini First)
|
| 290 |
+
gemini_key = os.environ.get("GEMINI_API_KEY")
|
| 291 |
+
report = ""
|
| 292 |
+
|
| 293 |
+
# Try Gemini
|
| 294 |
+
if gemini_key:
|
| 295 |
+
try:
|
| 296 |
+
if ctx:
|
| 297 |
+
await ctx.info("Synthesizing with Gemini (2.5-Flash)...")
|
| 298 |
+
genai.configure(api_key=gemini_key)
|
| 299 |
+
model = genai.GenerativeModel('gemini-2.5-flash')
|
| 300 |
+
|
| 301 |
+
prompt = f"""You are a tech analyst. Synthesize the following Medium articles into a 'State of the Union' report.
|
| 302 |
+
Highlight trends, key players, and sentiment.
|
| 303 |
+
|
| 304 |
+
Topic: {topic}
|
| 305 |
+
|
| 306 |
+
Articles:
|
| 307 |
+
{context_text}
|
| 308 |
+
"""
|
| 309 |
+
|
| 310 |
+
# Gemini async generation
|
| 311 |
+
response = await model.generate_content_async(prompt)
|
| 312 |
+
report = response.text
|
| 313 |
+
return report
|
| 314 |
+
except Exception as e:
|
| 315 |
+
if ctx:
|
| 316 |
+
await ctx.info(f"Gemini failed: {e}. Falling back to OpenAI...")
|
| 317 |
+
|
| 318 |
+
# Fallback: OpenAI
|
| 319 |
+
if not api_key:
|
| 320 |
+
return f"Error: Gemini failed/missing and OPENAI_API_KEY not set. Gemini Error: {report if 'report' in locals() else 'N/A'}"
|
| 321 |
+
|
| 322 |
+
client = AsyncOpenAI(api_key=api_key)
|
| 323 |
+
try:
|
| 324 |
+
response = await client.chat.completions.create(
|
| 325 |
+
model="gpt-4o",
|
| 326 |
+
messages=[
|
| 327 |
+
{"role": "system", "content": "You are a tech analyst. Synthesize the following Medium articles into a 'State of the Union' report. Highlight trends, key players, and sentiment."},
|
| 328 |
+
{"role": "user", "content": f"Topic: {topic}\n\nArticles:\n{context_text}"}
|
| 329 |
+
]
|
| 330 |
+
)
|
| 331 |
+
report = response.choices[0].message.content
|
| 332 |
+
except Exception as e:
|
| 333 |
+
if "insufficient_quota" in str(e) or "429" in str(e):
|
| 334 |
+
if ctx:
|
| 335 |
+
await ctx.info("OpenAI quota exceeded. Generating summary report locally...")
|
| 336 |
+
|
| 337 |
+
# Mock Fallback: Generate a report based on titles and authors
|
| 338 |
+
report = f"# State of the Union: {topic} (Generated Locally)\n\n"
|
| 339 |
+
report += "> **Note:** External AI services (Gemini & OpenAI) are currently unavailable or quota exceeded. This report is generated based on available metadata.\n\n"
|
| 340 |
+
report += "## Key Articles Analyzed\n"
|
| 341 |
+
for i, art in enumerate(articles):
|
| 342 |
+
report += f"- **{art.get('title')}** by {art.get('author', {}).get('name')}\n"
|
| 343 |
+
|
| 344 |
+
report += "\n## Summary\n"
|
| 345 |
+
report += f"Recent discussions on **{topic}** focus on the themes presented in the articles above. "
|
| 346 |
+
report += "Readers are encouraged to explore the full articles for in-depth analysis."
|
| 347 |
+
else:
|
| 348 |
+
report = f"Error generating report: {str(e)}"
|
| 349 |
+
|
| 350 |
+
return report
|
| 351 |
+
|
| 352 |
+
# --- Prompts ---
|
| 353 |
+
|
| 354 |
+
@mcp.prompt()
def summarize_article(url: str) -> str:
    """Create a prompt asking the model to summarize a Medium article."""
    parts = [
        f"Please read and summarize the following Medium article: {url}",
        "",
        "Focus on the key takeaways and novel insights.",
    ]
    return "\n".join(parts)
|
| 358 |
+
|
| 359 |
+
@mcp.prompt()
def tweet_thread(url: str) -> str:
    """Create a prompt that converts an article into a Twitter thread."""
    parts = [
        f"Read this article: {url}",
        "",
        "Convert it into a viral 5-tweet thread. Use emojis and keep it punchy.",
    ]
    return "\n".join(parts)
|
| 363 |
+
|
| 364 |
+
# --- Completions (Autocomplete) ---
|
| 365 |
+
|
| 366 |
+
COMMON_TAGS = [
|
| 367 |
+
"Artificial Intelligence", "Machine Learning", "Data Science", "Programming",
|
| 368 |
+
"Python", "JavaScript", "Startup", "Technology", "Writing", "Life Lessons",
|
| 369 |
+
"Productivity", "Design", "Marketing", "Business", "Health"
|
| 370 |
+
]
|
| 371 |
+
|
| 372 |
+
@mcp.tool()
|
| 373 |
+
async def medium_search_with_autocomplete(
|
| 374 |
+
tag: str,
|
| 375 |
+
ctx: Context = None
|
| 376 |
+
) -> str:
|
| 377 |
+
"""
|
| 378 |
+
Search Medium with tag autocomplete support.
|
| 379 |
+
"""
|
| 380 |
+
# Note: True autocomplete requires client-side support via the completion API,
|
| 381 |
+
# which FastMCP handles for Enums. For open strings, we provide this tool
|
| 382 |
+
# as a hint for future expansion.
|
| 383 |
+
return await medium_fresh(tag)
|
| 384 |
+
|
| 385 |
+
if __name__ == "__main__":
|
| 386 |
+
mcp.run()
|
src/__init__.py
ADDED
|
File without changes
|
src/__main__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
|
| 3 |
+
from .main import main
|
| 4 |
+
|
| 5 |
+
# Execute the Actor entry point.
|
| 6 |
+
asyncio.run(main())
|
src/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (147 Bytes). View file
|
|
|
src/__pycache__/__main__.cpython-313.pyc
ADDED
|
Binary file (269 Bytes). View file
|
|
|
src/__pycache__/config.cpython-313.pyc
ADDED
|
Binary file (3.76 kB). View file
|
|
|
src/__pycache__/main.cpython-313.pyc
ADDED
|
Binary file (9.89 kB). View file
|
|
|
src/__pycache__/parser.cpython-313.pyc
ADDED
|
Binary file (10.6 kB). View file
|
|
|
src/__pycache__/service.cpython-313.pyc
ADDED
|
Binary file (18.3 kB). View file
|
|
|
src/__pycache__/state.cpython-313.pyc
ADDED
|
Binary file (4 kB). View file
|
|
|
src/__pycache__/utils.cpython-313.pyc
ADDED
|
Binary file (683 Bytes). View file
|
|
|
src/config.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import Optional, List, Dict, Any
|
| 3 |
+
from apify import Actor
|
| 4 |
+
|
| 5 |
+
class ActorInput(BaseModel):
    """
    Defines the expected input schema for the Actor.

    Field aliases map the camelCase keys of the platform INPUT schema onto
    the snake_case attribute names used throughout the Python code.
    """
    search_query: Optional[str] = Field(
        default=None,
        alias="searchQuery",
        description="Search query to run on Medium."
    )
    tag: Optional[str] = Field(
        default=None,
        alias="tag",
        description="Topic tag to scrape (e.g. 'AI Agents')."
    )
    start_urls: List[Dict[str, Any]] = Field(
        default_factory=lambda: [{"url": "https://medium.com"}],
        alias="start_urls",
        description="List of start URLs."
    )
    max_requests_per_crawl: int = Field(
        default=100,
        alias="maxRequestsPerCrawl",
        description="Maximum number of requests to process."
    )
    min_concurrency: int = Field(
        default=1,
        alias="minConcurrency",
        description="Minimum number of parallel requests."
    )
    max_concurrency: int = Field(
        default=10,
        alias="maxConcurrency",
        description="Maximum number of parallel requests."
    )
    max_request_retries: int = Field(
        default=3,
        alias="maxRequestRetries",
        description="Number of retries for failed requests."
    )
    max_articles: int = Field(
        default=5,
        alias="maxArticles",
        description="Maximum number of articles to scrape from search results."
    )
    scrape_full_content: bool = Field(
        default=False,
        alias="scrapeFullContent",
        description="If True, visit article pages and extract full content."
    )
    enable_deduplication: bool = Field(
        default=True,
        alias="enableDeduplication",
        description="If True, skip previously seen articles."
    )
    proxy_configuration: Optional[Dict[str, Any]] = Field(
        default=None,
        alias="proxyConfiguration",
        description="Proxy configuration settings."
    )

    @classmethod
    async def load(cls) -> "ActorInput":
        """
        Loads input from the Actor platform and validates it against the schema.

        When no platform input is available (local development), falls back to
        a `local_input.json` file in the current working directory; if that is
        also missing, an empty dict is used so all field defaults apply.

        :raises pydantic.ValidationError: if the input does not match the schema.
        """
        actor_input = await Actor.get_input()
        if not actor_input:
            # Local-development fallback: read the input from a JSON file.
            import json
            from pathlib import Path

            local_input = Path.cwd() / "local_input.json"
            Actor.log.info(f"Checking for local input at: {local_input}")
            if local_input.exists():
                Actor.log.info(f"Loading input from local file: {local_input}")
                actor_input = json.loads(local_input.read_text(encoding="utf-8"))
            else:
                Actor.log.warning(f"Local input not found at {local_input}")
                actor_input = {}

        Actor.log.info(f"Raw Actor Input: {actor_input}")
        # Keys are matched via the declared aliases (camelCase).
        return cls(**actor_input)
|
src/main.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup
|
| 2 |
+
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
|
| 3 |
+
from crawlee import Request
|
| 4 |
+
from apify import Actor
|
| 5 |
+
import asyncio
|
| 6 |
+
from datetime import timedelta
|
| 7 |
+
|
| 8 |
+
from src.config import ActorInput
|
| 9 |
+
from src.parser import extract_search_results, extract_article_content
|
| 10 |
+
from src.utils import block_resources
|
| 11 |
+
from src.state import StateManager
|
| 12 |
+
|
| 13 |
+
async def main():
    """
    Actor entry point.

    Builds start URLs from the validated input, configures a PlaywrightCrawler
    and routes each page to either listing extraction (search/tag pages) or
    full-article extraction, pushing results to the default dataset.
    """
    import re

    await Actor.init()

    # Initialized BEFORE the try-block so the `finally` clause can safely
    # reference it even when ActorInput.load() raises (the original code
    # assigned it inside the try, after loading, causing a NameError there).
    state_manager = None

    try:
        # Load and validate input
        actor_input = await ActorInput.load()
        Actor.log.info(f"Loaded Input: {actor_input}")

        # Initialize the state manager that persists the set of seen URLs.
        if actor_input.enable_deduplication:
            state_manager = StateManager()
            await state_manager.load_state()
        else:
            Actor.log.info("Deduplication disabled. Scraping all articles.")

        # Build start URLs
        start_urls = []
        if actor_input.tag:
            # Tag-based scraping (freshness): /tag/<slug>/latest
            tag_slug = actor_input.tag.lower().replace(" ", "-")
            start_urls.append(f"https://medium.com/tag/{tag_slug}/latest")
            Actor.log.info(f"Targeting Tag: {actor_input.tag} (Latest)")
        elif actor_input.search_query:
            # Search-based scraping
            q = actor_input.search_query.replace(" ", "+")
            start_urls.append(f"https://medium.com/search?q={q}")

        # Add explicit start URLs if provided
        for u in actor_input.start_urls:
            if u.get("url") and "medium.com" in u["url"]:
                start_urls.append(u["url"])

        if not start_urls:
            Actor.log.info("No search query or valid start URLs provided. Exiting.")
            await Actor.exit()
            return

        # Create proxy configuration
        proxy_config = None
        if actor_input.proxy_configuration:
            proxy_config = await Actor.create_proxy_configuration(
                actor_proxy_input=actor_input.proxy_configuration
            )

        # NOTE(review): min_concurrency / max_concurrency from the input are
        # currently unused — wiring them up would require crawlee's
        # ConcurrencySettings; confirm desired behavior before adding.
        crawler = PlaywrightCrawler(
            proxy_configuration=proxy_config,
            max_requests_per_crawl=actor_input.max_requests_per_crawl,
            max_request_retries=actor_input.max_request_retries,
            request_handler_timeout=timedelta(seconds=60),
        )

        # Medium article URLs end with a 12-char hex hash, e.g. ...-1a2b3c4d5e6f.
        # (The original check used the literal substrings "/@/" and "-{12}",
        # which never occur in real URLs, so hash detection was dead code.)
        article_hash_re = re.compile(r"-[0-9a-f]{12}(?:[/?#]|$)")

        @crawler.router.default_handler
        async def handler(context: PlaywrightCrawlingContext):
            url = context.request.url
            Actor.log.info(f"Processing: {url}")

            # Block images/fonts/etc. to speed up page loads.
            await context.page.route("**/*", block_resources)

            # Wait for content
            try:
                Actor.log.info("Waiting for selectors...")
                await context.page.wait_for_load_state("domcontentloaded")
                await context.page.wait_for_selector("article, .postArticle, .js-block", timeout=10000)
                Actor.log.info("Selectors found.")
            except Exception as e:
                # Non-fatal: we still attempt to parse whatever rendered.
                Actor.log.warning(f"Timeout waiting for selectors on {url}: {e}")

            # Parse Content
            html = await context.page.content()
            soup = BeautifulSoup(html, "html.parser")

            # --- Router Logic ---

            # 1. Article Page (deep scraping). The explicit "ARTICLE" label set
            # at enqueue time is the primary signal; the hash regex catches
            # article URLs supplied directly via start_urls.
            if context.request.label == "ARTICLE" or article_hash_re.search(url):
                Actor.log.info(f"Scraping Article Content: {url}")

                user_data = context.request.user_data
                if not isinstance(user_data, dict):
                    user_data = {}

                try:
                    loop = asyncio.get_running_loop()
                    # BeautifulSoup extraction is CPU-bound; keep it off the loop.
                    content_data = await loop.run_in_executor(None, extract_article_content, soup)
                    Actor.log.info(f"Extracted content keys: {list(content_data.keys())}")
                    if content_data.get("markdownContent"):
                        Actor.log.info(f"Markdown length: {len(content_data['markdownContent'])}")
                    else:
                        Actor.log.warning("No markdown content extracted.")
                except Exception as e:
                    Actor.log.error(f"Error extracting content: {e}")
                    content_data = {}

                # Merge listing-page metadata with the extracted content;
                # content_data values win on key collisions.
                final_data = user_data.copy()
                final_data.update({
                    "url": url,
                    "title": final_data.get("title") or (soup.title.string if soup.title else None),
                    **content_data
                })

                await context.push_data(final_data)

            # 2. Search Page or Tag Page
            elif "medium.com/search" in url or "/tag/" in url:
                Actor.log.info(f"Scraping Listing Page: {url}")

                loop = asyncio.get_running_loop()
                results = await loop.run_in_executor(None, extract_search_results, soup, url)
                Actor.log.info(f"Found {len(results)} articles.")

                pushed = 0
                for rec in results:
                    if pushed >= actor_input.max_articles:
                        break

                    full_url = rec["url"]

                    # Deduplication check against the persisted state.
                    if state_manager and state_manager.is_seen(full_url):
                        Actor.log.info(f"Skipping seen URL: {full_url}")
                        continue

                    # Record the URL so reruns skip it.
                    if state_manager:
                        state_manager.add_seen(full_url)

                    if actor_input.scrape_full_content:
                        # Enqueue for deep scraping; listing metadata rides
                        # along in user_data and is merged by the ARTICLE branch.
                        await context.add_requests([Request.from_url(
                            url=full_url,
                            label="ARTICLE",
                            user_data={
                                "title": rec.get("title"),
                                "author": rec.get("author"),
                                "publishingDate": rec.get("publishingDate"),
                                "readingTime": rec.get("readingTime"),
                                "search_query": actor_input.search_query
                            }
                        )])
                    else:
                        # Fast mode: push listing metadata directly.
                        await context.push_data(rec)

                    pushed += 1

                # Push a summary record for the listing page itself.
                await context.push_data({
                    "type": "search_page",
                    "url": url,
                    "enqueued": pushed
                })

        Actor.log.info(f"Starting crawler with URLs: {start_urls}")
        await crawler.run(start_urls)

    except Exception as e:
        Actor.log.error(f"Crawler failed: {e}")
        raise
    finally:
        # Persist dedup state and finish the Actor run regardless of outcome.
        if state_manager:
            await state_manager.save_state()
        await Actor.exit()
| 178 |
+
|
| 179 |
+
# Allow running the crawler directly: `python -m src.main`.
if __name__ == "__main__":
    asyncio.run(main())
|
src/parser.py
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup
|
| 2 |
+
from typing import Dict, List, Optional, Any
|
| 3 |
+
from markdownify import markdownify as md
|
| 4 |
+
from urllib.parse import urljoin
|
| 5 |
+
|
| 6 |
+
def extract_search_results(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
    """
    Extracts article metadata from search result cards.

    Medium's DOM markup changes over time, so a sequence of CSS selectors is
    tried in order and the first one that yields any cards wins. Cards that
    produce no resolvable URL are dropped.
    """
    cards = []
    for selector in ("article", 'div[role="article"]', ".postArticle", ".js-block"):
        cards = soup.select(selector)
        if cards:
            break

    records = (_extract_from_card(card, base_url) for card in cards)
    return [record for record in records if record.get("url")]
|
| 25 |
+
|
| 26 |
+
def _extract_from_card(card, base_url: str) -> Dict[str, Any]:
    """
    Helper to extract data from a single search-result card element.

    All extraction here is heuristic: Medium's markup is unstable, so each
    field is located via a primary selector followed by looser fallbacks.
    Returns a dict with keys: url, title, author, publishingDate,
    readingTime, imageUrl — each None when it could not be determined.
    """
    # 1. URL & Title
    # Look for <a> tags that link to the article.
    # Usually the first <h2> inside an <a> is the title.
    title_tag = card.find("h2")
    title = title_tag.get_text(strip=True) if title_tag else None

    # Find the link associated with the title, falling back to the card's
    # first link when the title has no <a> ancestor.
    link_tag = card.find("a", href=True)
    if title_tag and title_tag.find_parent("a"):
        link_tag = title_tag.find_parent("a")

    url = None
    if link_tag:
        href = link_tag["href"]
        # Clean up URL (strip tracking query params like ?source=...)
        if "?" in href:
            href = href.split("?")[0]
        # Resolve relative hrefs against the page we scraped.
        url = urljoin(base_url, href)

    # 2. Author
    # Heuristic: look for links that go to a user profile (/@username or
    # /u/username) but aren't the main article link.
    author = None

    # Try specific selectors first (user-card link, legacy .ds-link, profile href).
    author_tag = card.select_one('a[data-action="show-user-card"]') or \
                 card.select_one('.ds-link') or \
                 card.select_one('a[href*="/@"]')

    if author_tag:
        # Verify it's not the title link before trusting its text.
        if title_tag and author_tag == title_tag.find_parent("a"):
            pass # It's the title
        else:
            author = author_tag.get_text(strip=True)

    # Fallback: look for a <p> or <span> that contains the author name.
    # Usually it's the first piece of text in the card meta area.
    if not author:
        # Scan text nodes, skipping anything that looks like a date or
        # read-time rather than a person's name.
        for p in card.find_all(["p", "span"]):
            txt = p.get_text(strip=True)
            # Skip empty, date-like, or read-time strings (month abbreviations
            # and "ago" cover Medium's relative/absolute date formats).
            if not txt or "min read" in txt or any(m in txt for m in ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "ago"]):
                continue
            # Skip text that is (part of) the title.
            if title and txt in title:
                continue

            # If it looks like a name (2-3 words, capitalized), take it.
            if 0 < len(txt.split()) <= 3 and txt[0].isupper():
                author = txt
                break

    # 3. Date / Reading Time
    # Both usually live in small <span> elements in the card's meta row.
    spans = card.find_all("span")
    pub_date = None
    reading_time = None

    for s in spans:
        txt = s.get_text(strip=True)
        # Reading time usually ends with "min read".
        if "min read" in txt:
            try:
                reading_time = float(txt.replace("min read", "").strip())
            except ValueError:
                pass
        # Date heuristic: "Nov 7" or "2 days ago".
        # Hard to parse perfectly without regex; take the first short span
        # containing a digit that isn't a reading time. Very rough — the raw
        # string is kept as-is rather than normalized to a date type.
        elif not pub_date and len(txt) < 15 and any(c.isdigit() for c in txt):
            # Very rough heuristic
            pub_date = txt

    # 4. Image URL
    # Priority:
    # 1. <img> inside the card that is not an avatar/thumbnail
    # 2. First <img> tag in the card
    # Note: search results don't always have og:image tags (those are in the
    # page <head>), so we must rely on the card's own HTML.
    image_url = None

    # Medium uses responsive images, often in <picture> or <img> with srcset.
    # We look for the first image that isn't an avatar.
    images = card.find_all("img")
    for img in images:
        src = img.get("src", "")
        # Skip small avatars (identified by URL patterns).
        if "1*dmbNkD5D-u45r44go_cf0g.png" in src: # Common default avatar
            continue
        if "resize:fill:20:20" in src: # Tiny thumbnail
            continue

        # If it's a valid image, take it.
        # Medium images often have 'cdn-images-1.medium.com'.
        if src:
            image_url = src
            break

    if not image_url:
        # Fallback to any img, avatar or not.
        img_tag = card.find("img")
        if img_tag and img_tag.get("src"):
            image_url = img_tag["src"]

    # author is wrapped in a dict to match the article-page extractor's shape.
    return {
        "url": url,
        "title": title,
        "author": {"name": author} if author else None,
        "publishingDate": pub_date,
        "readingTime": reading_time,
        "imageUrl": image_url,
    }
|
| 144 |
+
|
| 145 |
+
def parse_count(text: Optional[str]) -> Optional[int]:
    """
    Parse a Medium-style count string into an integer.

    Handles plain digits ("534"), thousands separators ("1,234") and K/M
    suffixes ("1.2K" -> 1200, "2M" -> 2000000). Returns None for empty or
    unrecognizable input instead of raising, so callers can assign the
    result directly to an optional field.
    """
    if not text:
        return None
    cleaned = text.strip().replace(",", "")
    multiplier = 1
    if cleaned.endswith(("K", "k")):
        multiplier = 1000
        cleaned = cleaned[:-1]
    elif cleaned.endswith(("M", "m")):
        multiplier = 1000000
        cleaned = cleaned[:-1]
    try:
        return int(float(cleaned) * multiplier)
    except ValueError:
        return None


def extract_article_content(soup: "BeautifulSoup", url: Optional[str] = None) -> Dict[str, Any]:
    """
    Extracts full content, claps, and responses from an article page.

    If extraction fails (Cloudflare/paywall), falls back progressively:
    URL-slug parsing for the title, intro paragraphs for the body, then
    og:description / meta description as a last resort.

    :param soup: Parsed article page.
    :param url: Optional article URL, used for slug-based title/author fallbacks.
    :return: Dict with keys markdownContent, claps, responses, title, author,
        publication; each value may be None when unavailable.
    """
    # Single local import replaces the two inline imports the original
    # version performed inside separate try-blocks.
    from urllib.parse import urlparse

    content_data = {
        "markdownContent": None,
        "claps": None,
        "responses": None,
        "title": None,
        "author": None,
        "publication": None  # Tracks the publication separately from the author
    }

    # --- Title (with fallbacks) ---
    # Try the <h1> first; articles normally render the title there.
    title_tag = soup.find("h1")
    if title_tag:
        content_data["title"] = title_tag.get_text(strip=True)

    # Try og:title.
    if not content_data["title"]:
        og_title = soup.find("meta", property="og:title")
        if og_title and og_title.get("content"):
            content_data["title"] = og_title.get("content")

    # If the title is empty or generic (Cloudflare interstitial / homepage),
    # reconstruct it from the URL slug.
    is_generic_title = content_data["title"] in [None, "", "Just a moment...", "medium.com", "Medium"]
    if is_generic_title and url:
        # Medium URLs look like:
        #   https://medium.com/@author/article-title-slug-<hash>
        #   https://medium.com/publication/article-title-slug-<hash>
        try:
            path_parts = urlparse(url).path.strip("/").split("/")
            if len(path_parts) >= 2:
                # Last path segment is the article slug.
                article_slug = path_parts[-1]
                # Strip the trailing Medium hash (12 chars after the last hyphen).
                slug_parts = article_slug.rsplit("-", 1)
                if len(slug_parts) > 1 and len(slug_parts[-1]) == 12:
                    article_slug = slug_parts[0]
                # Convert slug to a readable title.
                content_data["title"] = article_slug.replace("-", " ").title()
        except Exception:
            pass

    # Last resort: the <title> element (skipped when it is a generic page).
    if not content_data["title"]:
        title_elem = soup.find("title")
        if title_elem:
            page_title = title_elem.get_text(strip=True)
            if page_title and page_title not in ["Just a moment...", "medium.com", "Medium"]:
                content_data["title"] = page_title

    # --- Author ---
    # The <meta name="author"> tag is the most reliable source.
    meta_author = soup.find("meta", attrs={"name": "author"})
    if meta_author and meta_author.get("content"):
        content_data["author"] = {"name": meta_author.get("content")}
    else:
        # Fallback to in-page selectors.
        author_tag = soup.select_one('a[data-action="show-user-card"]') or soup.select_one('.ds-link')
        if author_tag:
            author_text = author_tag.get_text(strip=True)
            if author_text:  # Only set if we got actual text
                content_data["author"] = {"name": author_text}

    # --- Publication or author from the URL path ---
    if url:
        try:
            path_parts = urlparse(url).path.strip("/").split("/")
            if len(path_parts) >= 1:
                first_part = path_parts[0]
                if first_part.startswith("@"):
                    # /@username/... -> personal blog; use as author fallback.
                    username = first_part[1:]
                    formatted_name = username.replace("-", " ").title()
                    if not content_data["author"]:
                        content_data["author"] = {"name": formatted_name}
                else:
                    # /publication-name/... -> publication (e.g. "ai-in-plain-english").
                    # Not used as author — better than nothing for blocked pages.
                    content_data["publication"] = first_part.replace("-", " ").title()
        except Exception:
            pass

    # Pre-extract og:description for fallback use below.
    og_description = soup.find("meta", property="og:description")
    fallback_description = og_description.get("content") if og_description else None

    # --- Claps / Responses ---
    # parse_count tolerates "1.2K", "2M" and "1,234"; the original int(txt)
    # silently failed on separators and never handled the "M" suffix.
    clap_el = soup.select_one('button[data-testid="clapCount"]') or soup.select_one('.clapCount')
    if clap_el:
        content_data["claps"] = parse_count(clap_el.get_text(strip=True))

    resp_el = soup.select_one('button[data-testid="responsesCount"]') or soup.select_one('.responsesCount')
    if resp_el:
        content_data["responses"] = parse_count(resp_el.get_text(strip=True))

    # --- Body content ---
    article = soup.find("article") or soup.find("section")
    if article:
        # Remove navigation/engagement clutter before converting to Markdown.
        for tag in article.select("button, .speechify-btn, .metabar, footer"):
            tag.decompose()
        content_data["markdownContent"] = md(str(article), heading_style="ATX")

    # Fallback 1: intro paragraphs (text that loaded before a paywall),
    # used when the <article> extraction yielded little or nothing.
    if not content_data["markdownContent"] or len(content_data["markdownContent"]) < 100:
        paragraphs = soup.find_all("p")
        if paragraphs:
            intro_text = []
            for p in paragraphs[:10]:  # Check first 10 paragraphs
                text = p.get_text(strip=True)
                # Skip short/meta paragraphs ("4 min read", "2 days ago").
                if len(text) > 50 and "min read" not in text.lower() and "ago" not in text:
                    intro_text.append(text)
                    if len(intro_text) >= 3:  # Got enough intro paragraphs
                        break

            if intro_text:
                combined_intro = "\n\n".join(intro_text)
                if not content_data["markdownContent"]:
                    content_data["markdownContent"] = combined_intro
                else:
                    # Append intro to existing content if it was too short.
                    content_data["markdownContent"] += "\n\n" + combined_intro

    # Fallback 2: meta descriptions (if still effectively no content).
    if not content_data["markdownContent"] or len(content_data["markdownContent"]) < 50:
        if fallback_description:
            desc_text = f"Summary: {fallback_description}"
            if content_data["markdownContent"]:
                content_data["markdownContent"] = desc_text + "\n\n" + content_data["markdownContent"]
            else:
                content_data["markdownContent"] = desc_text
        else:
            # Last resort: try name="description".
            meta_desc = soup.find("meta", attrs={"name": "description"})
            if meta_desc:
                content_data["markdownContent"] = f"Summary: {meta_desc.get('content', '')}"

    return content_data
|
src/py.typed
ADDED
|
File without changes
|
src/service.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
from typing import List, Dict, Any, Optional, Callable, Awaitable
|
| 5 |
+
import logging
|
| 6 |
+
|
| 7 |
+
# Reuse existing logic
|
| 8 |
+
from src.parser import extract_search_results, extract_article_content
|
| 9 |
+
from src.utils import block_resources
|
| 10 |
+
|
| 11 |
+
# Configure logging
|
| 12 |
+
# Configure logging. basicConfig is a no-op if the host application has
# already installed handlers on the root logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ScraperService")
|
| 14 |
+
|
| 15 |
+
class ScraperWorker:
|
| 16 |
+
"""
|
| 17 |
+
A single worker that manages its own BrowserContext.
|
| 18 |
+
"""
|
| 19 |
+
def __init__(self, worker_id: int, browser: Browser):
    # Identifier used only in log message prefixes.
    self.worker_id = worker_id
    # Shared Playwright Browser instance; owned by the caller, never
    # closed by this worker.
    self.browser = browser
    # BrowserContext is created lazily by ensure_context().
    self.context: Optional[BrowserContext] = None
    # Serializes context creation across concurrent tasks.
    self._lock = asyncio.Lock()
|
| 24 |
+
|
| 25 |
+
async def ensure_context(self):
    """Ensures this worker has an open context.

    The lock guarantees that concurrent callers do not each create a
    context; only the first creates it, later callers reuse it.
    """
    async with self._lock:
        if not self.context:
            logger.info(f"[Worker {self.worker_id}] Creating context...")
            self.context = await self.browser.new_context()
            # Optional: Block resources globally for this context if possible,
            # but route is usually per-page.
|
| 33 |
+
|
| 34 |
+
async def scrape_search(self, query: str, max_articles: int, progress_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> List[Dict[str, Any]]:
    """Scrapes search results using this worker's context.

    :param query: Free-text search query; spaces become '+' in the URL.
    :param max_articles: Upper bound on the number of result records returned.
    :param progress_callback: Optional async callback receiving status strings.
    :return: Up to max_articles metadata dicts from extract_search_results.
    """
    if progress_callback:
        await progress_callback(f"[Worker {self.worker_id}] Starting search for '{query}'...")

    await self.ensure_context()
    # A fresh page per call; closed in the finally clause below.
    page = await self.context.new_page()

    try:
        # Block resources
        # await page.route("**/*", block_resources)  # disabled; kept for reference

        # Set a desktop Chrome User-Agent to look like a regular browser.
        await page.set_extra_http_headers({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        })

        url = f"https://medium.com/search?q={query.replace(' ', '+')}"
        logger.info(f"[Worker {self.worker_id}] Navigating to: {url}")
        if progress_callback:
            await progress_callback(f"[Worker {self.worker_id}] Navigating to {url}...")
        await page.goto(url, wait_until="domcontentloaded")

        # Wait for any known result-card selector; a timeout is non-fatal —
        # we still parse whatever HTML rendered.
        try:
            await page.wait_for_selector("article, div[role='article'], .postArticle, .js-block", timeout=30000)
        except Exception as e:
            logger.warning(f"[Worker {self.worker_id}] Timeout waiting for selectors: {e}")

        html = await page.content()
        soup = BeautifulSoup(html, "html.parser")

        results = extract_search_results(soup, url)
        return results[:max_articles]

    finally:
        await page.close()
|
| 70 |
+
|
| 71 |
+
async def scrape_tag(self, tag: str, max_articles: int, progress_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> List[Dict[str, Any]]:
|
| 72 |
+
"""Scrapes tag results using this worker's context."""
|
| 73 |
+
if progress_callback:
|
| 74 |
+
await progress_callback(f"[Worker {self.worker_id}] Starting tag scrape for '{tag}'...")
|
| 75 |
+
|
| 76 |
+
await self.ensure_context()
|
| 77 |
+
page = await self.context.new_page()
|
| 78 |
+
|
| 79 |
+
try:
|
| 80 |
+
# Set User Agent
|
| 81 |
+
await page.set_extra_http_headers({
|
| 82 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
| 83 |
+
})
|
| 84 |
+
|
| 85 |
+
tag_slug = tag.lower().replace(" ", "-")
|
| 86 |
+
url = f"https://medium.com/tag/{tag_slug}"
|
| 87 |
+
logger.info(f"[Worker {self.worker_id}] Navigating to: {url}")
|
| 88 |
+
if progress_callback:
|
| 89 |
+
await progress_callback(f"[Worker {self.worker_id}] Navigating to {url}...")
|
| 90 |
+
await page.goto(url, wait_until="domcontentloaded")
|
| 91 |
+
|
| 92 |
+
try:
|
| 93 |
+
await page.wait_for_selector("article, div[role='article'], .postArticle, .js-block", timeout=30000)
|
| 94 |
+
except Exception as e:
|
| 95 |
+
logger.warning(f"[Worker {self.worker_id}] Timeout waiting for selectors: {e}")
|
| 96 |
+
|
| 97 |
+
html = await page.content()
|
| 98 |
+
soup = BeautifulSoup(html, "html.parser")
|
| 99 |
+
|
| 100 |
+
results = extract_search_results(soup, url)
|
| 101 |
+
return results[:max_articles]
|
| 102 |
+
|
| 103 |
+
finally:
|
| 104 |
+
await page.close()
|
| 105 |
+
|
| 106 |
+
async def close(self):
|
| 107 |
+
if self.context:
|
| 108 |
+
await self.context.close()
|
| 109 |
+
|
| 110 |
+
async def scrape_article(self, url: str) -> Dict[str, Any]:
|
| 111 |
+
"""Scrapes full article content using this worker's context."""
|
| 112 |
+
await self.ensure_context()
|
| 113 |
+
page = await self.context.new_page()
|
| 114 |
+
|
| 115 |
+
try:
|
| 116 |
+
# Stealth: Remove webdriver property
|
| 117 |
+
await page.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
|
| 118 |
+
|
| 119 |
+
# Set standard User Agent
|
| 120 |
+
await page.set_extra_http_headers({
|
| 121 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
| 122 |
+
})
|
| 123 |
+
|
| 124 |
+
logger.info(f"[Worker {self.worker_id}] Navigating to article: {url}")
|
| 125 |
+
|
| 126 |
+
# Anti-bot: Go to homepage first to set cookies
|
| 127 |
+
await page.goto("https://medium.com/", wait_until="domcontentloaded")
|
| 128 |
+
await page.wait_for_timeout(1500)
|
| 129 |
+
|
| 130 |
+
# Navigate to article
|
| 131 |
+
await page.goto(url, wait_until="domcontentloaded", timeout=20000)
|
| 132 |
+
|
| 133 |
+
# Wait longer for dynamic content to load
|
| 134 |
+
await page.wait_for_timeout(3000)
|
| 135 |
+
|
| 136 |
+
try:
|
| 137 |
+
# Wait for the main article content
|
| 138 |
+
# Try multiple selectors
|
| 139 |
+
await page.wait_for_selector("article, section, main, div[role='main'], h1", timeout=20000)
|
| 140 |
+
|
| 141 |
+
# Scroll to bottom to trigger lazy loading
|
| 142 |
+
for i in range(5):
|
| 143 |
+
await page.evaluate("window.scrollBy(0, window.innerHeight)")
|
| 144 |
+
await page.wait_for_timeout(500)
|
| 145 |
+
|
| 146 |
+
# Wait a bit more after scrolling
|
| 147 |
+
await page.wait_for_timeout(2000)
|
| 148 |
+
|
| 149 |
+
except Exception as e:
|
| 150 |
+
logger.warning(f"[Worker {self.worker_id}] Timeout waiting for article selectors: {e}")
|
| 151 |
+
|
| 152 |
+
# Fallback: Try Google Cache (Text Only)
|
| 153 |
+
try:
|
| 154 |
+
logger.info(f"[Worker {self.worker_id}] Trying Google Cache for: {url}")
|
| 155 |
+
cache_url = f"http://webcache.googleusercontent.com/search?q=cache:{url}&strip=1"
|
| 156 |
+
await page.goto(cache_url, wait_until="domcontentloaded", timeout=15000)
|
| 157 |
+
|
| 158 |
+
# Google Cache (Text Only) usually puts content in <pre> or just body
|
| 159 |
+
# We'll let the standard extractor try, or just grab body
|
| 160 |
+
await page.wait_for_selector("body", timeout=5000)
|
| 161 |
+
|
| 162 |
+
except Exception as e2:
|
| 163 |
+
logger.warning(f"[Worker {self.worker_id}] Google Cache failed: {e2}")
|
| 164 |
+
|
| 165 |
+
html = await page.content()
|
| 166 |
+
soup = BeautifulSoup(html, "html.parser")
|
| 167 |
+
|
| 168 |
+
content = extract_article_content(soup, url=url) # Pass URL for fallback parsing
|
| 169 |
+
content["html_debug"] = html # For debugging
|
| 170 |
+
|
| 171 |
+
# Fallback if markdown is empty
|
| 172 |
+
if not content.get("markdownContent"):
|
| 173 |
+
# Try to just get text from body as a last resort
|
| 174 |
+
body = soup.find("body")
|
| 175 |
+
if body:
|
| 176 |
+
text = body.get_text(separator="\n", strip=True)
|
| 177 |
+
# Clean up a bit
|
| 178 |
+
if len(text) > 500:
|
| 179 |
+
content["markdownContent"] = text[:5000] # Limit fallback text
|
| 180 |
+
else:
|
| 181 |
+
content["markdownContent"] = "Could not extract article content. It might be behind a paywall or login."
|
| 182 |
+
|
| 183 |
+
return content
|
| 184 |
+
|
| 185 |
+
finally:
|
| 186 |
+
await page.close()
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
class ScraperService:
|
| 190 |
+
"""
|
| 191 |
+
Manages a pool of ScraperWorkers for concurrent scraping.
|
| 192 |
+
"""
|
| 193 |
+
def __init__(self, max_workers: int = 5, headless: bool = True):
|
| 194 |
+
self.max_workers = max_workers
|
| 195 |
+
self.headless = headless
|
| 196 |
+
self.playwright = None
|
| 197 |
+
self.browser: Optional[Browser] = None
|
| 198 |
+
self.workers: List[ScraperWorker] = []
|
| 199 |
+
self.worker_queue = asyncio.Queue()
|
| 200 |
+
self._initialized = False
|
| 201 |
+
self._lock = asyncio.Lock()
|
| 202 |
+
|
| 203 |
+
async def ensure_initialized(self):
|
| 204 |
+
"""Starts Playwright, Browser, and Workers."""
|
| 205 |
+
async with self._lock:
|
| 206 |
+
# Check if browser is alive
|
| 207 |
+
if self.browser and not self.browser.is_connected():
|
| 208 |
+
logger.warning("Browser is disconnected. Restarting...")
|
| 209 |
+
await self.close()
|
| 210 |
+
self._initialized = False
|
| 211 |
+
self.workers = []
|
| 212 |
+
self.worker_queue = asyncio.Queue()
|
| 213 |
+
|
| 214 |
+
if self._initialized:
|
| 215 |
+
return
|
| 216 |
+
|
| 217 |
+
logger.info("Initializing Scraper Service...")
|
| 218 |
+
self.playwright = await async_playwright().start()
|
| 219 |
+
self.browser = await self.playwright.chromium.launch(headless=self.headless)
|
| 220 |
+
|
| 221 |
+
# Create Workers
|
| 222 |
+
self.workers = [] # Reset workers
|
| 223 |
+
for i in range(self.max_workers):
|
| 224 |
+
worker = ScraperWorker(i, self.browser)
|
| 225 |
+
self.workers.append(worker)
|
| 226 |
+
await self.worker_queue.put(worker)
|
| 227 |
+
|
| 228 |
+
self._initialized = True
|
| 229 |
+
logger.info(f"Initialized {self.max_workers} workers.")
|
| 230 |
+
|
| 231 |
+
async def _get_worker(self) -> ScraperWorker:
|
| 232 |
+
"""Retrieves a free worker from the queue."""
|
| 233 |
+
# Check connection before getting worker
|
| 234 |
+
if self.browser and not self.browser.is_connected():
|
| 235 |
+
logger.warning("Browser disconnected in _get_worker. Re-initializing...")
|
| 236 |
+
await self.ensure_initialized()
|
| 237 |
+
|
| 238 |
+
await self.ensure_initialized()
|
| 239 |
+
return await self.worker_queue.get()
|
| 240 |
+
|
| 241 |
+
async def _release_worker(self, worker: ScraperWorker):
|
| 242 |
+
"""Returns a worker to the queue."""
|
| 243 |
+
await self.worker_queue.put(worker)
|
| 244 |
+
|
| 245 |
+
async def scrape_search(self, query: str, max_articles: int = 5, progress_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> List[Dict[str, Any]]:
|
| 246 |
+
"""Delegates search to a free worker."""
|
| 247 |
+
worker = await self._get_worker()
|
| 248 |
+
try:
|
| 249 |
+
return await worker.scrape_search(query, max_articles, progress_callback)
|
| 250 |
+
finally:
|
| 251 |
+
await self._release_worker(worker)
|
| 252 |
+
|
| 253 |
+
async def scrape_tag(self, tag: str, max_articles: int = 5, progress_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> List[Dict[str, Any]]:
|
| 254 |
+
"""Delegates tag scrape to a free worker."""
|
| 255 |
+
worker = await self._get_worker()
|
| 256 |
+
try:
|
| 257 |
+
return await worker.scrape_tag(tag, max_articles, progress_callback)
|
| 258 |
+
finally:
|
| 259 |
+
await self._release_worker(worker)
|
| 260 |
+
|
| 261 |
+
async def close(self):
|
| 262 |
+
"""Closes all workers and the browser."""
|
| 263 |
+
logger.info("Closing ScraperService...")
|
| 264 |
+
for worker in self.workers:
|
| 265 |
+
await worker.close()
|
| 266 |
+
|
| 267 |
+
if self.browser:
|
| 268 |
+
await self.browser.close()
|
| 269 |
+
if self.playwright:
|
| 270 |
+
await self.playwright.stop()
|
| 271 |
+
|
| 272 |
+
async def scrape_article(self, url: str) -> Dict[str, Any]:
|
| 273 |
+
"""Delegates article scrape to a free worker."""
|
| 274 |
+
worker = await self._get_worker()
|
| 275 |
+
try:
|
| 276 |
+
return await worker.scrape_article(url)
|
| 277 |
+
finally:
|
| 278 |
+
await self._release_worker(worker)
|
| 279 |
+
|
src/state.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from apify import Actor
|
| 2 |
+
from typing import Set
|
| 3 |
+
|
| 4 |
+
class StateManager:
|
| 5 |
+
"""
|
| 6 |
+
Manages the persistent state of the actor, specifically for deduplication.
|
| 7 |
+
"""
|
| 8 |
+
def __init__(self):
|
| 9 |
+
self.seen_urls: Set[str] = set()
|
| 10 |
+
|
| 11 |
+
async def load_state(self):
|
| 12 |
+
"""
|
| 13 |
+
Loads the state from the default key-value store.
|
| 14 |
+
"""
|
| 15 |
+
state = await Actor.get_value("STATE")
|
| 16 |
+
|
| 17 |
+
if not state:
|
| 18 |
+
# Fallback for local development
|
| 19 |
+
import json
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
cwd = Path.cwd()
|
| 22 |
+
local_state = cwd / "local_state.json"
|
| 23 |
+
Actor.log.info(f"Checking for local state at: {local_state}")
|
| 24 |
+
if local_state.exists():
|
| 25 |
+
try:
|
| 26 |
+
state = json.loads(local_state.read_text(encoding="utf-8"))
|
| 27 |
+
Actor.log.info(f"Loaded state from local file: {local_state}")
|
| 28 |
+
except Exception as e:
|
| 29 |
+
Actor.log.warning(f"Failed to load local state: {e}")
|
| 30 |
+
state = {}
|
| 31 |
+
else:
|
| 32 |
+
state = {}
|
| 33 |
+
|
| 34 |
+
self.seen_urls = set(state.get("seen_urls", []))
|
| 35 |
+
Actor.log.info(f"Loaded state: {len(self.seen_urls)} seen URLs.")
|
| 36 |
+
|
| 37 |
+
async def save_state(self):
|
| 38 |
+
"""
|
| 39 |
+
Saves the current state to the default key-value store.
|
| 40 |
+
"""
|
| 41 |
+
state = {
|
| 42 |
+
"seen_urls": list(self.seen_urls)
|
| 43 |
+
}
|
| 44 |
+
await Actor.set_value("STATE", state)
|
| 45 |
+
|
| 46 |
+
# Backup for local development
|
| 47 |
+
import json
|
| 48 |
+
from pathlib import Path
|
| 49 |
+
cwd = Path.cwd()
|
| 50 |
+
local_state = cwd / "local_state.json"
|
| 51 |
+
try:
|
| 52 |
+
local_state.write_text(json.dumps(state, indent=2), encoding="utf-8")
|
| 53 |
+
Actor.log.info(f"Backed up state to local file: {local_state}")
|
| 54 |
+
except Exception as e:
|
| 55 |
+
Actor.log.warning(f"Failed to backup local state: {e}")
|
| 56 |
+
|
| 57 |
+
Actor.log.info(f"Saved state: {len(self.seen_urls)} seen URLs.")
|
| 58 |
+
|
| 59 |
+
def is_seen(self, url: str) -> bool:
|
| 60 |
+
"""
|
| 61 |
+
Checks if a URL has already been seen.
|
| 62 |
+
"""
|
| 63 |
+
return url in self.seen_urls
|
| 64 |
+
|
| 65 |
+
def add_seen(self, url: str):
|
| 66 |
+
"""
|
| 67 |
+
Adds a URL to the set of seen URLs.
|
| 68 |
+
"""
|
| 69 |
+
self.seen_urls.add(url)
|
src/utils.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from apify import Actor
|
| 2 |
+
|
| 3 |
+
async def block_resources(route):
|
| 4 |
+
"""
|
| 5 |
+
Blocks unnecessary resources to speed up scraping.
|
| 6 |
+
"""
|
| 7 |
+
if route.request.resource_type in ["image", "stylesheet", "font", "media"]:
|
| 8 |
+
await route.abort()
|
| 9 |
+
else:
|
| 10 |
+
await route.continue_()
|