Nikhil Pravin Pise commited on
Commit
a80eeb8
·
1 Parent(s): 81035fa

Initial deploy

Browse files
.env ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Add your API keys in Hugging Face Space Settings (Secrets)
2
+ # GEMINI_API_KEY=
3
+ # OPENAI_API_KEY=
4
+ # ELEVENLABS_API_KEY=
Dockerfile ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# Set working directory
WORKDIR /app

# Install system dependencies
# (git/wget/gnupg — presumably needed for VCS pip installs and package
# fetching; TODO confirm all three are actually required)
RUN apt-get update && apt-get install -y \
    git \
    wget \
    gnupg \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first to leverage the Docker layer cache:
# dependency layers are rebuilt only when requirements.txt changes
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Install Playwright and the Chromium browser (plus its OS dependencies)
RUN playwright install --with-deps chromium

# Copy the rest of the application
COPY . .

# Set environment variables: make /app importable, disable stdout buffering
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# Expose port 7860 for Gradio (matches demo.launch(server_port=7860) in app.py)
EXPOSE 7860

# Run the application
CMD ["python", "app.py"]
README.md CHANGED
@@ -1,12 +1,13 @@
1
  ---
2
- title: Medium MCP
3
- emoji: 😻
4
- colorFrom: purple
5
- colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
- license: mit
9
- short_description: A MCP Server with a Scraper built in
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
  ---
2
+ title: Medium Agent
3
+ emoji: 📝
4
+ colorFrom: gray
5
+ colorTo: black
6
  sdk: docker
7
  pinned: false
8
+ app_port: 7860
 
9
  ---
10
 
11
+ # Medium Agent
12
+
13
+ A powerful Medium article scraper and audio generator.
app.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import asyncio
3
+ import os
4
+ import sys
5
+ import ast
6
+ from dotenv import load_dotenv
7
+
8
+ # Load environment variables
9
+ load_dotenv()
10
+
11
+ # Import tools from server
12
+ # We assume server.py is in the same directory
13
+ try:
14
+ from server import medium_search, medium_cast, medium_synthesize
15
+ except ImportError:
16
+ # If running locally with different structure, try to adjust path
17
+ sys.path.append(os.path.dirname(__file__))
18
+ from server import medium_search, medium_cast, medium_synthesize
19
+
20
async def search_wrapper(query):
    """Search Medium for *query* and render the results as an HTML card grid.

    Args:
        query: Free-text search string; falsy values short-circuit with a hint.

    Returns:
        An HTML string (card grid) on success, otherwise a human-readable
        status or error message.
    """
    if not query:
        return "Please enter a query."

    gr.Info(f"Searching for '{query}'...")
    # Initialized up front so the except-branch can always reference it —
    # previously this raised NameError when medium_search() itself failed
    # before result_str was assigned.
    result_str = ""
    try:
        # Get string result from tool (the MCP tool returns str(list_of_dicts))
        result_str = await medium_search(query)

        # Parse string back to list
        results = ast.literal_eval(result_str)

        if not results:
            return "No results found."

        html = "<div style='display: grid; grid-template-columns: repeat(auto-fill, minmax(300px, 1fr)); gap: 20px;'>"
        for art in results:
            title = art.get('title', 'No Title')
            url = art.get('url', '#')
            author = art.get('author', {}).get('name', 'Unknown') if art.get('author') else 'Unknown'
            publication = art.get('publication', '')
            # Prefer the publication name when no individual author is known.
            if publication and author == 'Unknown':
                author = publication

            img = art.get('imageUrl', '')

            # Fallback image if empty
            if not img:
                img = "https://miro.medium.com/max/1400/1*jfdwtvU6V6g99q3G7gq7dQ.png"

            html += f"""
            <div style='border: 1px solid #ddd; border-radius: 8px; overflow: hidden; padding: 0; background: #2b2b2b; color: #fff; display: flex; flex-direction: column;'>
                <div style='height: 160px; background-image: url("{img}"); background-size: cover; background-position: center;'></div>
                <div style='padding: 15px; flex-grow: 1;'>
                    <h3 style='margin: 0 0 10px 0; font-size: 16px; line-height: 1.4;'><a href='{url}' target='_blank' style='color: #fff; text-decoration: none;'>{title}</a></h3>
                    <p style='margin: 0; font-size: 12px; color: #aaa;'>By {author}</p>
                </div>
            </div>
            """
        html += "</div>"
        return html
    except Exception as e:
        return f"Error parsing results: {e}. Raw output: {result_str}"
63
+
64
async def audio_wrapper(url, voice_id):
    """Generate audio for a Medium article URL via the medium_cast tool.

    Args:
        url: The article URL to convert.
        voice_id: ElevenLabs voice ID, forwarded to medium_cast (only used
            when the ElevenLabs fallback kicks in).

    Returns:
        Tuple of (status message, audio file path or None).
    """
    if not url:
        return "Please enter a URL.", None

    gr.Info("Generating Audio... This may take a minute.")

    # Note: medium_cast uses Edge-TTS by default (free), so we don't strictly need API keys
    # unless falling back to ElevenLabs/OpenAI

    try:
        result = await medium_cast(url, voice_id)

        # Success messages look like "Audio generated successfully ...: <path>"
        if "Audio generated successfully" in result:
            try:
                # Split only on the FIRST ": " so a path that itself contains
                # ": " survives intact (the old split(": ")[1] truncated it);
                # also strip any markdown backticks around the path.
                path = result.split(": ", 1)[1].strip()
                path = path.replace("`", "")
                if os.path.exists(path):
                    return result, path
                else:
                    return f"{result} (File not found at {path})", None
            except IndexError:
                # No ": <path>" suffix — report the status without audio.
                # (Narrowed from a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit.)
                return result, None
        return result, None
    except Exception as e:
        return f"Error: {str(e)}", None
92
+
93
async def synthesize_wrapper(topic):
    """Run the Smart Synthesis tool for *topic* and return a Markdown report."""
    if not topic:
        return "Please enter a topic."

    # Synthesis needs at least one LLM backend configured (Gemini is primary).
    has_llm_key = any(os.environ.get(key) for key in ("GEMINI_API_KEY", "OPENAI_API_KEY"))
    if not has_llm_key:
        return "⚠️ Warning: No GEMINI_API_KEY or OPENAI_API_KEY found. Synthesis might fail or return mock data."

    gr.Info(f"Synthesizing report for '{topic}'... This involves scraping multiple articles and may take 2-3 minutes.")
    try:
        return await medium_synthesize(topic)
    except Exception as exc:
        return f"Error during synthesis: {str(exc)}"
106
+
107
# Build UI
# Three tabs, one per MCP tool: search (medium_search), audio (medium_cast)
# and synthesis (medium_synthesize). The wrapper coroutines above adapt each
# tool's string output to the Gradio component it feeds.
with gr.Blocks(title="Medium Agent", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📝 Medium Agent")
    gr.Markdown("Search, Read, and Listen to Medium articles. Powered by MCP and Playwright.")

    with gr.Tab("🔍 Search"):
        gr.Markdown("### Search Medium Articles")
        with gr.Row():
            search_input = gr.Textbox(label="Query", placeholder="e.g. AI Agents", scale=4)
            search_btn = gr.Button("Search", variant="primary", scale=1)
        search_output = gr.HTML(label="Results")
        # Trigger the search on both the button click and Enter in the textbox.
        search_btn.click(search_wrapper, inputs=search_input, outputs=search_output)
        search_input.submit(search_wrapper, inputs=search_input, outputs=search_output)

    with gr.Tab("🎧 Audio Article"):
        gr.Markdown("### Convert Article to Audio")
        gr.Markdown("Uses Edge-TTS (Free) by default. Falls back to ElevenLabs/OpenAI if configured.")
        with gr.Row():
            url_input = gr.Textbox(label="Article URL", placeholder="https://medium.com/...", scale=4)
            audio_btn = gr.Button("Generate Audio", variant="primary", scale=1)

        # The voice ID is only consulted when the ElevenLabs fallback is used.
        with gr.Accordion("Advanced Options", open=False):
            voice_input = gr.Textbox(label="Voice ID (for ElevenLabs)", value="JBFqnCBsd6RMkjVDRZzb")

        # Output status and audio player (audio_wrapper returns a 2-tuple
        # matching these two components).
        audio_status = gr.Textbox(label="Status", interactive=False)
        audio_player = gr.Audio(label="Play Audio", type="filepath")

        audio_btn.click(audio_wrapper, inputs=[url_input, voice_input], outputs=[audio_status, audio_player])

    with gr.Tab("🧠 Smart Synthesis"):
        gr.Markdown("### Generate 'State of the Union' Report")
        gr.Markdown("Scrapes top articles on a topic and uses Gemini/OpenAI to generate a comprehensive report.")
        with gr.Row():
            topic_input = gr.Textbox(label="Topic", placeholder="e.g. Generative AI", scale=4)
            synth_btn = gr.Button("Synthesize", variant="primary", scale=1)
        synth_output = gr.Markdown(label="Report")
        synth_btn.click(synthesize_wrapper, inputs=topic_input, outputs=synth_output)
        topic_input.submit(synthesize_wrapper, inputs=topic_input, outputs=synth_output)

if __name__ == "__main__":
    # Launch with 0.0.0.0 for Docker/Cloud support
    demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ playwright>=1.40.0
2
+ beautifulsoup4>=4.12.0
3
+ markdownify>=0.11.6
4
+ httpx>=0.25.0
5
+ aiofiles>=23.2.1
6
+ google-generativeai>=0.3.0
7
+ openai>=1.3.0
8
+ edge-tts>=6.1.0
9
+ elevenlabs>=0.2.0
10
+ mcp>=0.9.0
11
+ fastmcp>=0.2.0
12
+ python-dotenv>=1.0.0
13
+ gradio>=4.0.0
server.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import asyncio
4
+ import httpx
5
+ from typing import List, Optional
6
+ from elevenlabs.client import ElevenLabs
7
+ from openai import AsyncOpenAI
8
+ import google.generativeai as genai
9
+ import edge_tts
10
+
11
+ # Add sibling 'Medium-Scraper' directory to sys.path to access 'src'
12
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../Medium-Scraper"))
13
+ if project_root not in sys.path:
14
+ sys.path.insert(0, project_root)
15
+
16
+ from mcp.server.fastmcp import FastMCP, Context, Image
17
+ from src.service import ScraperService
18
+
19
+ # Initialize FastMCP
20
+ mcp = FastMCP("Medium Scraper")
21
+
22
+ # Initialize Scraper Service (Worker Pool)
23
+ scraper = ScraperService(max_workers=5)
24
+
25
+ # --- Resources ---
26
+
27
@mcp.resource("medium://trending")
async def get_trending(ctx: Context = None) -> str:
    """Return the top trending Medium articles (proxied via the 'trending' tag)."""
    progress = ctx.info if ctx else None
    if progress:
        await progress("Fetching trending articles...")

    trending = await scraper.scrape_tag("trending", max_articles=10, progress_callback=progress)
    return str(trending)
36
+
37
@mcp.resource("medium://tag/{tag}")
async def get_tag_feed(tag: str, ctx: Context = None) -> str:
    """Return the latest articles published under a specific Medium tag."""
    progress = ctx.info if ctx else None
    if progress:
        await progress(f"Fetching articles for tag: {tag}...")

    feed = await scraper.scrape_tag(tag, max_articles=10, progress_callback=progress)
    return str(feed)
45
+
46
+ # --- Tools ---
47
+
48
@mcp.tool()
async def medium_search(query: str, ctx: Context = None) -> str:
    """
    Search Medium for articles.
    Args:
        query: The search query (e.g. "AI Agents", "Python Asyncio")
    """
    progress = ctx.info if ctx else None
    if progress:
        await progress(f"Searching for: {query}...")

    found = await scraper.scrape_search(query, progress_callback=progress)
    return str(found)
60
+
61
@mcp.tool()
async def medium_fresh(tag: str, ctx: Context = None) -> str:
    """
    Get the latest articles for a specific tag (Freshness).
    Args:
        tag: The topic tag (e.g. "Artificial Intelligence")
    """
    progress = ctx.info if ctx else None
    if progress:
        await progress(f"Fetching fresh articles for: {tag}...")

    fresh = await scraper.scrape_tag(tag, progress_callback=progress)
    return str(fresh)
73
+
74
@mcp.tool()
async def get_thumbnail(image_url: str, ctx: Context = None) -> Image:
    """
    Fetch an image from a URL and return it as an MCP Image object.

    Args:
        image_url: The URL of the image to fetch.
        ctx: Optional MCP context for progress notifications.

    Returns:
        An MCP Image whose format is derived from the response's
        Content-Type header (falls back to "png" when unavailable).

    Raises:
        httpx.HTTPStatusError: If the server responds with an error status.
    """
    if ctx:
        await ctx.info(f"Fetching image: {image_url}...")
    # follow_redirects: image CDNs commonly redirect; a bounded timeout keeps
    # the tool from hanging on dead hosts.
    async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client:
        response = await client.get(image_url)
        response.raise_for_status()

    # Previously the format was hard-coded to "png" even for JPEG/GIF
    # responses; derive it from the Content-Type header instead.
    content_type = response.headers.get("content-type", "")
    fmt = "png"
    if content_type.startswith("image/"):
        fmt = content_type.split("/", 1)[1].split(";", 1)[0].strip() or "png"
    return Image(data=response.content, format=fmt)
+
88
@mcp.tool()
async def medium_cast(url: str, voice_id: str = "JBFqnCBsd6RMkjVDRZzb", ctx: Context = None) -> str:
    """
    Convert a Medium article into audio.

    Tries Edge-TTS first (free, no API key required), then falls back to
    ElevenLabs and finally OpenAI TTS when the corresponding keys are set.

    Args:
        url: The URL of the article.
        voice_id: The ElevenLabs voice ID to use (default: 'JBFqnCBsd6RMkjVDRZzb' - George).
        ctx: Optional MCP context for progress notifications.

    Returns:
        A status string; on success it embeds the path of the generated MP3.
    """
    # NOTE: no hard requirement on ELEVENLABS_API_KEY here. Edge-TTS is the
    # primary (free) backend; the previous early return on a missing key made
    # that free path unreachable, contradicting the UI's "no key needed" note.
    if ctx:
        await ctx.info(f"Scraping article for audio: {url}...")

    try:
        article = await scraper.scrape_article(url)
        if not article:
            return "Error: Failed to scrape article (returned None)."

        text = article.get("markdownContent", "")
        title = article.get("title") or "Article"

        author_data = article.get("author")
        author = author_data.get("name") if (author_data and isinstance(author_data, dict)) else None
        publication = article.get("publication")  # New: get publication separately

        if ctx:
            await ctx.info("Scraping complete. Processing text...")

        # Handle missing/blocked content (paywall or bot-check pages)
        if not text or "Could not extract" in text or "Verify you are human" in text:
            # Try to construct a fallback script.
            # If text starts with "Summary:", it means we got the meta description.
            description = ""
            if text and text.startswith("Summary:"):
                description = text.replace("Summary:", "").strip()

            # Build attribution line
            attribution = f"Title: {title}."
            if author:
                attribution += f" By {author}."
            elif publication:
                attribution += f" Published by {publication}."

            text = attribution + " "

            if description:
                text += f"Here is a summary: {description}. "
                text += "I could not retrieve the full text due to access restrictions, but I hope this summary is helpful."
            else:
                text += "I could not retrieve the full text of this article due to access restrictions, but I encourage you to read it on Medium."

        # Final validation
        if not text or len(text.strip()) < 10:
            return "Error: No text available to generate audio."

        # Truncate for TTS (save cost/time)
        if len(text) > 2500:
            text = text[:2500] + "... (end of preview)"

        import uuid
        # Unique filename so concurrent calls don't overwrite each other.
        output_filename = f"output_{uuid.uuid4().hex}.mp3"
        output_path = os.path.join(os.path.dirname(__file__), output_filename)

        if ctx:
            await ctx.info(f"Text prepared ({len(text)} chars). Starting audio generation...")

        # 1. Try Edge-TTS (Free, High Quality)
        try:
            if ctx:
                await ctx.info("Generating audio with Edge-TTS (Free)...")

            # Voice: en-US-ChristopherNeural (Male) or en-US-AriaNeural (Female)
            communicate = edge_tts.Communicate(text, "en-US-ChristopherNeural")
            await communicate.save(output_path)

            # Verify file
            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                return f"Audio generated successfully (via Edge-TTS): {output_path}"
            else:
                raise Exception("Edge-TTS generated empty file.")

        except Exception as e:
            if ctx:
                await ctx.info(f"Edge-TTS failed: {e}. Falling back to ElevenLabs...")

            # 2. Fallback: ElevenLabs
            api_key = os.environ.get("ELEVENLABS_API_KEY")
            if api_key:
                try:
                    if ctx:
                        await ctx.info("Generating audio with ElevenLabs...")

                    # Run blocking I/O in thread so the event loop stays free.
                    def _run_elevenlabs():
                        client = ElevenLabs(api_key=api_key)
                        audio_generator = client.text_to_speech.convert(
                            text=text,
                            voice_id=voice_id,
                            model_id="eleven_multilingual_v2",
                            output_format="mp3_44100_128",
                        )
                        with open(output_path, "wb") as f:
                            for chunk in audio_generator:
                                f.write(chunk)

                    await asyncio.to_thread(_run_elevenlabs)

                    if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                        return f"Audio generated successfully: {output_path}"
                except Exception as e:
                    error_msg = str(e)
                    if "quota_exceeded" in error_msg:
                        if ctx:
                            await ctx.info("ElevenLabs quota exceeded. Falling back to OpenAI TTS...")
                    else:
                        if ctx:
                            await ctx.info(f"ElevenLabs failed: {e}. Falling back to OpenAI TTS...")

            # 3. Fallback: OpenAI
            openai_key = os.environ.get("OPENAI_API_KEY")
            if not openai_key:
                return "Error: Edge-TTS failed, ElevenLabs failed/missing, and OPENAI_API_KEY not set."

            try:
                if ctx:
                    await ctx.info("Generating audio with OpenAI TTS...")

                client = AsyncOpenAI(api_key=openai_key)
                response = await client.audio.speech.create(
                    model="tts-1",
                    voice="alloy",
                    input=text[:4096]  # OpenAI limit
                )

                response.stream_to_file(output_path)
                return f"Audio generated successfully (via OpenAI Fallback): {output_path}"
            except Exception as e2:
                if "insufficient_quota" in str(e2) or "429" in str(e2):
                    return "Error: All TTS services (Edge-TTS, ElevenLabs, OpenAI) failed or quotas exceeded."
                return f"Error generating audio (All fallbacks failed): {str(e2)}"

    except Exception as e:
        return f"Error generating audio: {str(e)}"
233
+
234
+
235
+
236
@mcp.tool()
async def medium_synthesize(topic: str, ctx: Context = None) -> str:
    """
    Synthesize a 'State of the Union' report on a topic using top Medium articles.

    Pipeline: search for the topic, deep-scrape the top 5 hits in parallel,
    then ask Gemini (primary) or OpenAI (fallback) to synthesize a report.
    When both LLMs fail on quota errors, a metadata-only report is built
    locally so the tool still returns something useful.

    Args:
        topic: The topic to analyze (e.g. "Generative AI").
        ctx: Optional MCP context for progress notifications.

    Returns:
        A Markdown report string, or an "Error: ..." message.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    # We check for OpenAI key as a baseline, but Gemini key is checked inside
    if not api_key and not os.environ.get("GEMINI_API_KEY"):
        return "Error: Neither OPENAI_API_KEY nor GEMINI_API_KEY is set."

    if ctx:
        await ctx.info(f"Scraping top articles for: {topic}...")

    # 1. Scrape Top Articles
    articles = await scraper.scrape_search(topic, max_articles=5, progress_callback=ctx.info if ctx else None)

    if not articles:
        return "No articles found to synthesize."

    # 2. Prepare Context for LLM (Parallel Scraping)
    if ctx:
        await ctx.info(f"Deep scraping {len(articles)} articles in parallel...")

    async def _scrape_single_article(art):
        # Fetch one article's full text; degrade to metadata-only on failure.
        url = art.get('url')
        title = art.get('title')
        author = art.get('author', {}).get('name')

        content = ""
        try:
            full_art = await scraper.scrape_article(url)
            content = full_art.get("markdownContent", "")
            # Scraper's "Could not extract" sentinel means the text is unusable.
            if "Could not extract" in content:
                content = ""
        except Exception:
            content = ""

        if not content:
            content = f"Summary: {title} by {author}. (Full content unavailable)."

        # Truncate each article to keep the combined LLM prompt bounded.
        content = content[:2000] + "..." if len(content) > 2000 else content
        return f"\nTitle: {title}\nURL: {url}\nAuthor: {author}\nContent:\n{content}\n"

    # Run all scrapes concurrently
    results = await asyncio.gather(*[_scrape_single_article(art) for art in articles])
    context_text = "".join(results)

    if ctx:
        await ctx.info("Synthesizing insights...")

    # 3. Call LLM (Gemini First)
    gemini_key = os.environ.get("GEMINI_API_KEY")
    report = ""

    # Try Gemini
    if gemini_key:
        try:
            if ctx:
                await ctx.info("Synthesizing with Gemini (2.5-Flash)...")
            genai.configure(api_key=gemini_key)
            model = genai.GenerativeModel('gemini-2.5-flash')

            prompt = f"""You are a tech analyst. Synthesize the following Medium articles into a 'State of the Union' report.
Highlight trends, key players, and sentiment.

Topic: {topic}

Articles:
{context_text}
"""

            # Gemini async generation
            response = await model.generate_content_async(prompt)
            report = response.text
            return report
        except Exception as e:
            if ctx:
                await ctx.info(f"Gemini failed: {e}. Falling back to OpenAI...")

    # Fallback: OpenAI
    # NOTE(review): `report` is always bound ("" above), so the
    # `'report' in locals()` guard is always True and the interpolated value
    # is always empty here — the Gemini error text is never actually shown.
    if not api_key:
        return f"Error: Gemini failed/missing and OPENAI_API_KEY not set. Gemini Error: {report if 'report' in locals() else 'N/A'}"

    client = AsyncOpenAI(api_key=api_key)
    try:
        response = await client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a tech analyst. Synthesize the following Medium articles into a 'State of the Union' report. Highlight trends, key players, and sentiment."},
                {"role": "user", "content": f"Topic: {topic}\n\nArticles:\n{context_text}"}
            ]
        )
        report = response.choices[0].message.content
    except Exception as e:
        if "insufficient_quota" in str(e) or "429" in str(e):
            if ctx:
                await ctx.info("OpenAI quota exceeded. Generating summary report locally...")

            # Mock Fallback: Generate a report based on titles and authors
            report = f"# State of the Union: {topic} (Generated Locally)\n\n"
            report += "> **Note:** External AI services (Gemini & OpenAI) are currently unavailable or quota exceeded. This report is generated based on available metadata.\n\n"
            report += "## Key Articles Analyzed\n"
            for i, art in enumerate(articles):
                report += f"- **{art.get('title')}** by {art.get('author', {}).get('name')}\n"

            report += "\n## Summary\n"
            report += f"Recent discussions on **{topic}** focus on the themes presented in the articles above. "
            report += "Readers are encouraged to explore the full articles for in-depth analysis."
        else:
            report = f"Error generating report: {str(e)}"

    return report
351
+
352
+ # --- Prompts ---
353
+
354
@mcp.prompt()
def summarize_article(url: str) -> str:
    """Build an LLM prompt asking for a summary of the given Medium article."""
    parts = (
        f"Please read and summarize the following Medium article: {url}",
        "",
        "Focus on the key takeaways and novel insights.",
    )
    return "\n".join(parts)
358
+
359
@mcp.prompt()
def tweet_thread(url: str) -> str:
    """Build an LLM prompt that turns the article into a 5-tweet thread."""
    instruction = "Convert it into a viral 5-tweet thread. Use emojis and keep it punchy."
    return f"Read this article: {url}\n\n" + instruction
363
+
364
+ # --- Completions (Autocomplete) ---
365
+
366
# Common Medium topic tags; intended as a vocabulary for client-side
# autocomplete of the `tag` argument of the tool below.
COMMON_TAGS = [
    "Artificial Intelligence", "Machine Learning", "Data Science", "Programming",
    "Python", "JavaScript", "Startup", "Technology", "Writing", "Life Lessons",
    "Productivity", "Design", "Marketing", "Business", "Health"
]

@mcp.tool()
async def medium_search_with_autocomplete(
    tag: str,
    ctx: Context = None
) -> str:
    """
    Search Medium with tag autocomplete support.
    """
    # Note: True autocomplete requires client-side support via the completion API,
    # which FastMCP handles for Enums. For open strings, we provide this tool
    # as a hint for future expansion.
    # NOTE(review): calls the @mcp.tool()-decorated medium_fresh directly and
    # drops `ctx` — confirm the decorator leaves the function callable.
    return await medium_fresh(tag)

if __name__ == "__main__":
    # Run the MCP server (stdio transport by default).
    mcp.run()
src/__init__.py ADDED
File without changes
src/__main__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
"""Package entry point: run the Apify Actor via ``python -m src``."""
import asyncio

from .main import main

# Execute the Actor entry point.
asyncio.run(main())
src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (147 Bytes). View file
 
src/__pycache__/__main__.cpython-313.pyc ADDED
Binary file (269 Bytes). View file
 
src/__pycache__/config.cpython-313.pyc ADDED
Binary file (3.76 kB). View file
 
src/__pycache__/main.cpython-313.pyc ADDED
Binary file (9.89 kB). View file
 
src/__pycache__/parser.cpython-313.pyc ADDED
Binary file (10.6 kB). View file
 
src/__pycache__/service.cpython-313.pyc ADDED
Binary file (18.3 kB). View file
 
src/__pycache__/state.cpython-313.pyc ADDED
Binary file (4 kB). View file
 
src/__pycache__/utils.cpython-313.pyc ADDED
Binary file (683 Bytes). View file
 
src/config.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import Optional, List, Dict, Any
3
+ from apify import Actor
4
+
5
class ActorInput(BaseModel):
    """
    Defines the expected input schema for the Actor.

    Field aliases map the camelCase keys of the Apify input JSON onto
    snake_case attributes.
    """
    search_query: Optional[str] = Field(
        default=None,
        alias="searchQuery",
        description="Search query to run on Medium."
    )
    tag: Optional[str] = Field(
        default=None,
        alias="tag",
        description="Topic tag to scrape (e.g. 'AI Agents')."
    )
    start_urls: List[Dict[str, Any]] = Field(
        default_factory=lambda: [{"url": "https://medium.com"}],
        alias="start_urls",
        description="List of start URLs."
    )
    max_requests_per_crawl: int = Field(
        default=100,
        alias="maxRequestsPerCrawl",
        description="Maximum number of requests to process."
    )
    min_concurrency: int = Field(
        default=1,
        alias="minConcurrency",
        description="Minimum number of parallel requests."
    )
    max_concurrency: int = Field(
        default=10,
        alias="maxConcurrency",
        description="Maximum number of parallel requests."
    )
    max_request_retries: int = Field(
        default=3,
        alias="maxRequestRetries",
        description="Number of retries for failed requests."
    )
    max_articles: int = Field(
        default=5,
        alias="maxArticles",
        description="Maximum number of articles to scrape from search results."
    )
    scrape_full_content: bool = Field(
        default=False,
        alias="scrapeFullContent",
        description="If True, visit article pages and extract full content."
    )
    enable_deduplication: bool = Field(
        default=True,
        alias="enableDeduplication",
        description="If True, skip previously seen articles."
    )
    proxy_configuration: Optional[Dict[str, Any]] = Field(
        default=None,
        alias="proxyConfiguration",
        description="Proxy configuration settings."
    )

    @classmethod
    async def load(cls) -> "ActorInput":
        """
        Loads input from the Actor and validates it against the schema.

        Falls back to a ``local_input.json`` file in the current working
        directory when the Actor platform provides no input (useful for
        local development runs).
        """
        actor_input = await Actor.get_input()
        if not actor_input:
            # Local-development fallback: read input from ./local_input.json.
            # (Removed an unused `import os` that previously sat here.)
            import json
            from pathlib import Path
            cwd = Path.cwd()
            local_input = cwd / "local_input.json"
            Actor.log.info(f"Checking for local input at: {local_input}")
            if local_input.exists():
                Actor.log.info(f"Loading input from local file: {local_input}")
                actor_input = json.loads(local_input.read_text(encoding="utf-8"))
            else:
                Actor.log.warning(f"Local input not found at {local_input}")
                actor_input = {}

        Actor.log.info(f"Raw Actor Input: {actor_input}")
        return cls(**actor_input)
src/main.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
3
+ from crawlee import Request
4
+ from apify import Actor
5
+ import asyncio
6
+ from datetime import timedelta
7
+
8
+ from src.config import ActorInput
9
+ from src.parser import extract_search_results, extract_article_content
10
+ from src.utils import block_resources
11
+ from src.state import StateManager
12
+
13
async def main():
    """Actor entry point: load input, build start URLs, run the crawler.

    Routes pages by type: listing pages (search/tag) yield article metadata
    and optionally enqueue article pages for deep scraping; article pages
    are parsed into markdown content and pushed to the dataset.
    """
    await Actor.init()

    # Bound BEFORE the try block so the finally clause can always reference
    # it — previously, if ActorInput.load() raised, `state_manager` was
    # unbound and the finally clause itself crashed with NameError.
    state_manager = None

    try:
        # Load and validate input
        actor_input = await ActorInput.load()
        Actor.log.info(f"Loaded Input: {actor_input}")

        # Initialize State Manager (cross-run URL deduplication)
        if actor_input.enable_deduplication:
            state_manager = StateManager()
            await state_manager.load_state()
        else:
            Actor.log.info("Deduplication disabled. Scraping all articles.")

        # Build start URLs
        start_urls = []
        if actor_input.tag:
            # Tag-based scraping (Freshness)
            tag_slug = actor_input.tag.lower().replace(" ", "-")
            start_urls.append(f"https://medium.com/tag/{tag_slug}/latest")
            Actor.log.info(f"Targeting Tag: {actor_input.tag} (Latest)")
        elif actor_input.search_query:
            # Search-based scraping
            q = actor_input.search_query.replace(" ", "+")
            start_urls.append(f"https://medium.com/search?q={q}")

        # Add explicit start URLs if provided
        for u in actor_input.start_urls:
            if u.get("url") and "medium.com" in u["url"]:
                start_urls.append(u["url"])

        if not start_urls:
            Actor.log.info("No search query or valid start URLs provided. Exiting.")
            await Actor.exit()
            return

        # Create proxy configuration
        proxy_config = None
        if actor_input.proxy_configuration:
            proxy_config = await Actor.create_proxy_configuration(
                actor_proxy_input=actor_input.proxy_configuration
            )

        crawler = PlaywrightCrawler(
            proxy_configuration=proxy_config,
            max_requests_per_crawl=actor_input.max_requests_per_crawl,
            max_request_retries=actor_input.max_request_retries,
            request_handler_timeout=timedelta(seconds=60),
        )

        @crawler.router.default_handler
        async def handler(context: PlaywrightCrawlingContext):
            url = context.request.url
            Actor.log.info(f"Processing: {url}")

            # Enable resource blocking to speed up page loads
            await context.page.route("**/*", block_resources)

            # Wait for content
            try:
                Actor.log.info("Waiting for selectors...")
                await context.page.wait_for_load_state("domcontentloaded")
                await context.page.wait_for_selector("article, .postArticle, .js-block", timeout=10000)
                Actor.log.info("Selectors found.")
            except Exception as e:
                Actor.log.warning(f"Timeout waiting for selectors on {url}: {e}")

            # Parse Content
            html = await context.page.content()
            soup = BeautifulSoup(html, "html.parser")

            # --- Router Logic ---

            # 1. Article Page (Deep Scraping)
            # NOTE(review): '"-{12}" in url' is a literal substring test and can
            # never match a real URL — it reads like an unfinished regex for the
            # 12-hex-digit article-id suffix; '"/@/" in url' is similarly
            # suspect. Requests WE enqueue are still routed correctly via
            # label == "ARTICLE", so only externally-supplied article URLs are
            # affected. Left as-is pending confirmation of the intended match.
            if context.request.label == "ARTICLE" or "/@/" in url or "-{12}" in url:
                Actor.log.info(f"Scraping Article Content: {url}")

                user_data = context.request.user_data
                if not isinstance(user_data, dict):
                    user_data = {}

                try:
                    # Run the CPU-bound parse off the event loop.
                    loop = asyncio.get_running_loop()
                    content_data = await loop.run_in_executor(None, extract_article_content, soup)
                    Actor.log.info(f"Extracted content keys: {list(content_data.keys())}")
                    if content_data.get("markdownContent"):
                        Actor.log.info(f"Markdown length: {len(content_data['markdownContent'])}")
                    else:
                        Actor.log.warning("No markdown content extracted.")
                except Exception as e:
                    Actor.log.error(f"Error extracting content: {e}")
                    content_data = {}

                # Merge listing-page metadata with the scraped content
                final_data = user_data.copy()
                final_data.update({
                    "url": url,
                    "title": final_data.get("title") or (soup.title.string if soup.title else None),
                    **content_data
                })

                await context.push_data(final_data)

            # 2. Search Page or Tag Page
            elif "medium.com/search" in url or "/tag/" in url:
                Actor.log.info(f"Scraping Listing Page: {url}")

                loop = asyncio.get_running_loop()
                results = await loop.run_in_executor(None, extract_search_results, soup, url)
                Actor.log.info(f"Found {len(results)} articles.")

                pushed = 0
                for rec in results:
                    if pushed >= actor_input.max_articles:
                        break

                    full_url = rec["url"]

                    # Deduplication Check
                    if state_manager and state_manager.is_seen(full_url):
                        Actor.log.info(f"Skipping seen URL: {full_url}")
                        continue

                    # Add to state
                    if state_manager:
                        state_manager.add_seen(full_url)

                    if actor_input.scrape_full_content:
                        # Enqueue for deep scraping
                        await context.add_requests([Request.from_url(
                            url=full_url,
                            label="ARTICLE",
                            user_data={
                                "title": rec.get("title"),
                                "author": rec.get("author"),
                                "publishingDate": rec.get("publishingDate"),
                                "readingTime": rec.get("readingTime"),
                                "search_query": actor_input.search_query
                            }
                        )])
                    else:
                        # Fast mode: push listing metadata straight to the dataset
                        await context.push_data(rec)

                    pushed += 1

                # Push search page summary
                await context.push_data({
                    "type": "search_page",
                    "url": url,
                    "enqueued": pushed
                })

        Actor.log.info(f"Starting crawler with URLs: {start_urls}")
        await crawler.run(start_urls)

    except Exception as e:
        Actor.log.error(f"Crawler failed: {e}")
        raise
    finally:
        # Persist dedup state and shut the Actor down even on failure.
        if state_manager:
            await state_manager.save_state()
        await Actor.exit()

if __name__ == "__main__":
    asyncio.run(main())
src/parser.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ from typing import Dict, List, Optional, Any
3
+ from markdownify import markdownify as md
4
+ from urllib.parse import urljoin
5
+
6
def extract_search_results(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
    """Extract article metadata from the result cards on a listing page.

    Args:
        soup: Parsed HTML of a Medium search or tag page.
        base_url: Page URL, used to resolve relative article links.

    Returns:
        One metadata dict per card that yielded a resolvable article URL.
    """
    # Medium's DOM changes often, so probe a cascade of card selectors and
    # use the first one that matches anything.
    card_selectors = ("article", 'div[role="article"]', ".postArticle", ".js-block")
    cards = []
    for selector in card_selectors:
        cards = soup.select(selector)
        if cards:
            break

    extracted = (_extract_from_card(card, base_url) for card in cards)
    # Keep only cards where a usable article URL could be resolved.
    return [record for record in extracted if record.get("url")]
25
+
26
def _extract_from_card(card, base_url: str) -> Dict[str, Any]:
    """Helper to extract data from a single card element.

    Pulls url, title, author, publishingDate, readingTime and imageUrl out of
    one listing-page card using layered heuristics (Medium has no stable class
    names). Any field that cannot be determined is returned as None.
    """
    # 1. URL & Title
    # Look for <a> tags that link to the article.
    # Usually the first <h2> inside an <a> is the title.
    title_tag = card.find("h2")
    title = title_tag.get_text(strip=True) if title_tag else None

    # Find the link associated with the title; fall back to the card's first link.
    link_tag = card.find("a", href=True)
    if title_tag and title_tag.find_parent("a"):
        link_tag = title_tag.find_parent("a")

    url = None
    if link_tag:
        href = link_tag["href"]
        # Drop tracking query params so the URL is stable for deduplication.
        if "?" in href:
            href = href.split("?")[0]
        url = urljoin(base_url, href)

    # 2. Author
    # Heuristic: look for links that go to a user profile (/@username)
    # but aren't the main article link.
    author = None

    # Try author-specific selectors first.
    author_tag = card.select_one('a[data-action="show-user-card"]') or \
                 card.select_one('.ds-link') or \
                 card.select_one('a[href*="/@"]')

    if author_tag:
        # Verify the match isn't actually the title's wrapping link.
        if title_tag and author_tag == title_tag.find_parent("a"):
            pass  # It's the title
        else:
            author = author_tag.get_text(strip=True)

    # Fallback: scan <p>/<span> text for something that looks like a name.
    # Skip dates, reading times, and the title itself.
    if not author:
        # Find the meta div (often has date/read time)
        # We look for text that is NOT the date or read time
        for p in card.find_all(["p", "span"]):
            txt = p.get_text(strip=True)
            # Skip empty, date-like, or read-time strings
            if not txt or "min read" in txt or any(m in txt for m in ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "ago"]):
                continue
            # Skip title
            if title and txt in title:
                continue

            # If it looks like a name (1-3 words, capitalized), take it.
            if 0 < len(txt.split()) <= 3 and txt[0].isupper():
                author = txt
                break

    # 3. Date / Reading Time — both usually live in <span> elements.
    spans = card.find_all("span")
    pub_date = None
    reading_time = None

    for s in spans:
        txt = s.get_text(strip=True)
        # Reading time usually ends with "min read"
        if "min read" in txt:
            try:
                reading_time = float(txt.replace("min read", "").strip())
            except ValueError:
                pass
        # Date heuristic: "Nov 7" or "2 days ago".
        # Hard to parse perfectly without regex; take the first short span
        # containing a digit that isn't the reading time. Very rough.
        elif not pub_date and len(txt) < 15 and any(c.isdigit() for c in txt):
            # Very rough heuristic
            pub_date = txt

    # 4. Image URL
    # Priority:
    #   1. First non-avatar <img> inside the card.
    #   2. Any <img> at all (may be an avatar — last resort).
    # Note: search results don't carry og:image tags (those are in <head>),
    # so we must rely on the card's own HTML.
    image_url = None

    images = card.find_all("img")
    for img in images:
        src = img.get("src", "")
        # Skip small avatars (often 20x20 or similar in URL)
        if "1*dmbNkD5D-u45r44go_cf0g.png" in src:  # Common default avatar
            continue
        if "resize:fill:20:20" in src:  # Tiny thumbnail
            continue

        # If it's a valid image, take it.
        # Medium images often have 'cdn-images-1.medium.com'
        if src:
            image_url = src
            break

    if not image_url:
        # Fallback to any img (may re-select a skipped avatar — accepted tradeoff).
        img_tag = card.find("img")
        if img_tag and img_tag.get("src"):
            image_url = img_tag["src"]

    return {
        "url": url,
        "title": title,
        "author": {"name": author} if author else None,
        "publishingDate": pub_date,
        "readingTime": reading_time,
        "imageUrl": image_url,
    }
144
+
145
def _parse_count(text: str) -> Optional[int]:
    """Parse a Medium count badge into an integer.

    Handles plain numbers ("87"), comma-grouped numbers ("3,400"), and
    K/M-suffixed shorthand ("1.2K", "3M"). Returns None if unparseable.
    """
    text = text.strip().replace(",", "")
    if not text:
        return None
    try:
        upper = text.upper()
        if upper.endswith("K"):
            return int(float(text[:-1]) * 1_000)
        if upper.endswith("M"):
            return int(float(text[:-1]) * 1_000_000)
        return int(float(text))
    except ValueError:
        return None

def extract_article_content(soup: BeautifulSoup, url: Optional[str] = None) -> Dict[str, Any]:
    """
    Extracts full content, claps, and responses from an article page.
    If extraction fails (Cloudflare/paywall), falls back to URL parsing.

    Args:
        soup: Parsed HTML of an article page.
        url: The article URL; used for metadata fallbacks when the page is
            blocked (slug -> title, path -> author/publication).

    Returns:
        Dict with markdownContent, claps, responses, title, author and
        publication keys; each may be None when unavailable.
    """
    content_data = {
        "markdownContent": None,
        "claps": None,
        "responses": None,
        "title": None,
        "author": None,
        "publication": None  # Tracks publication separately from author
    }

    # --- Title (with fallbacks) ---
    # Try the <h1> first.
    title_tag = soup.find("h1")
    if title_tag:
        content_data["title"] = title_tag.get_text(strip=True)

    # Then og:title.
    if not content_data["title"]:
        og_title = soup.find("meta", property="og:title")
        if og_title and og_title.get("content"):
            content_data["title"] = og_title.get("content")

    # If the title is missing or generic (Cloudflare interstitial / homepage),
    # reconstruct it from the URL slug.
    is_generic_title = content_data["title"] in [None, "", "Just a moment...", "medium.com", "Medium"]
    if is_generic_title and url:
        # Medium URLs look like:
        #   https://medium.com/@author/article-title-slug-<hash>
        #   https://medium.com/publication/article-title-slug-<hash>
        try:
            from urllib.parse import urlparse
            path_parts = urlparse(url).path.strip("/").split("/")
            if len(path_parts) >= 2:
                # Last part is the article slug.
                article_slug = path_parts[-1]
                # Strip the trailing article hash (Medium hashes are 12 chars).
                slug_parts = article_slug.rsplit("-", 1)
                if len(slug_parts) > 1 and len(slug_parts[-1]) == 12:
                    article_slug = slug_parts[0]
                # Convert slug to title: replace-hyphens-with-spaces.
                content_data["title"] = article_slug.replace("-", " ").title()
        except Exception:
            pass

    # Last resort: the <title> element, unless it's a Cloudflare/generic page.
    if not content_data["title"]:
        title_elem = soup.find("title")
        if title_elem:
            page_title = title_elem.get_text(strip=True)
            if page_title and page_title not in ["Just a moment...", "medium.com", "Medium"]:
                content_data["title"] = page_title

    # --- Author ---
    # <meta name="author"> is the most reliable source.
    meta_author = soup.find("meta", attrs={"name": "author"})
    if meta_author and meta_author.get("content"):
        content_data["author"] = {"name": meta_author.get("content")}
    else:
        # Fallback to known author-link selectors.
        author_tag = soup.select_one('a[data-action="show-user-card"]') or soup.select_one('.ds-link')
        if author_tag:
            author_text = author_tag.get_text(strip=True)
            if author_text:  # Only set if we got actual text
                content_data["author"] = {"name": author_text}

    # --- Publication / author from the URL path ---
    if url:
        try:
            from urllib.parse import urlparse
            path_parts = urlparse(url).path.strip("/").split("/")
            if len(path_parts) >= 1:
                first_part = path_parts[0]
                # "@username" means a personal blog; use it as an author fallback.
                if first_part.startswith("@"):
                    username = first_part[1:]  # Remove @ symbol
                    formatted_name = username.replace("-", " ").title()
                    if not content_data["author"]:
                        content_data["author"] = {"name": formatted_name}
                # Otherwise the first segment is a publication slug
                # (like "ai-in-plain-english").
                else:
                    content_data["publication"] = first_part.replace("-", " ").title()
        except Exception:
            pass

    # Pre-extract og:description as a content fallback for blocked pages.
    og_description = soup.find("meta", property="og:description")
    fallback_description = og_description.get("content") if og_description else None

    # --- Claps ---
    try:
        clap_el = soup.select_one('button[data-testid="clapCount"]') or soup.select_one('.clapCount')
        if clap_el:
            content_data["claps"] = _parse_count(clap_el.get_text(strip=True))
    except Exception:
        pass

    # --- Responses ---
    try:
        resp_el = soup.select_one('button[data-testid="responsesCount"]') or soup.select_one('.responsesCount')
        if resp_el:
            # Response counts can use the same "1.2K" shorthand as claps.
            content_data["responses"] = _parse_count(resp_el.get_text(strip=True))
    except Exception:
        pass

    # --- Content ---
    article = soup.find("article") or soup.find("section")
    if article:
        # Remove UI clutter before converting to markdown.
        for tag in article.select("button, .speechify-btn, .metabar, footer"):
            tag.decompose()
        content_data["markdownContent"] = md(str(article), heading_style="ATX")

    # Fallback 1: grab intro paragraphs (text that loaded before the paywall)
    # even if the <article> extraction failed or came back too short.
    if not content_data["markdownContent"] or len(content_data["markdownContent"]) < 100:
        paragraphs = soup.find_all("p")
        if paragraphs:
            intro_text = []
            for p in paragraphs[:10]:  # Check first 10 paragraphs
                text = p.get_text(strip=True)
                # Skip short/meta paragraphs ("5 min read", "2 days ago", ...).
                # Match "ago" as a whole word so words like "Chicago" don't
                # wrongly exclude a paragraph.
                contains_ago = any(tok.strip(".,") == "ago" for tok in text.split())
                if len(text) > 50 and "min read" not in text.lower() and not contains_ago:
                    intro_text.append(text)
                if len(intro_text) >= 3:  # Got enough intro paragraphs
                    break

            if intro_text:
                combined_intro = "\n\n".join(intro_text)
                if not content_data["markdownContent"]:
                    content_data["markdownContent"] = combined_intro
                else:
                    # Append intro to existing content if it was too short.
                    content_data["markdownContent"] += "\n\n" + combined_intro

    # Fallback 2: meta description (if still effectively empty).
    if not content_data["markdownContent"] or len(content_data["markdownContent"]) < 50:
        if fallback_description:
            desc_text = f"Summary: {fallback_description}"
            if content_data["markdownContent"]:
                content_data["markdownContent"] = desc_text + "\n\n" + content_data["markdownContent"]
            else:
                content_data["markdownContent"] = desc_text
        else:
            # Last resort: try name="description".
            meta_desc = soup.find("meta", attrs={"name": "description"})
            if meta_desc:
                content_data["markdownContent"] = f"Summary: {meta_desc.get('content', '')}"

    return content_data
src/py.typed ADDED
File without changes
src/service.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from playwright.async_api import async_playwright, Browser, BrowserContext, Page
3
+ from bs4 import BeautifulSoup
4
+ from typing import List, Dict, Any, Optional, Callable, Awaitable
5
+ import logging
6
+
7
+ # Reuse existing logic
8
+ from src.parser import extract_search_results, extract_article_content
9
+ from src.utils import block_resources
10
+
11
+ # Configure logging
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger("ScraperService")
14
+
15
class ScraperWorker:
    """
    A single worker that manages its own BrowserContext.

    Each worker owns one Playwright browser context; every scrape opens a
    fresh page inside it, so concurrent scrapes never share page state.
    """

    # Desktop Chrome UA used on listing pages so Medium serves the full DOM.
    _LISTING_USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
    # Newer Chrome UA for article pages (helps against anti-bot checks).
    _ARTICLE_USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )

    def __init__(self, worker_id: int, browser: Browser):
        self.worker_id = worker_id
        self.browser = browser
        self.context: Optional[BrowserContext] = None
        # Guards lazy context creation against concurrent callers.
        self._lock = asyncio.Lock()

    async def ensure_context(self):
        """Ensures this worker has an open context."""
        async with self._lock:
            if not self.context:
                logger.info(f"[Worker {self.worker_id}] Creating context...")
                self.context = await self.browser.new_context()

    async def _scrape_listing(self, url: str, max_articles: int, progress_callback: Optional[Callable[[str], Awaitable[None]]]) -> List[Dict[str, Any]]:
        """Shared implementation for search and tag listing pages.

        Navigates to *url*, waits for result cards, parses them with
        extract_search_results, and returns at most *max_articles* records.
        """
        await self.ensure_context()
        page = await self.context.new_page()

        try:
            await page.set_extra_http_headers({"User-Agent": self._LISTING_USER_AGENT})

            logger.info(f"[Worker {self.worker_id}] Navigating to: {url}")
            if progress_callback:
                await progress_callback(f"[Worker {self.worker_id}] Navigating to {url}...")
            await page.goto(url, wait_until="domcontentloaded")

            try:
                # Card selectors mirror those used by the parser.
                await page.wait_for_selector("article, div[role='article'], .postArticle, .js-block", timeout=30000)
            except Exception as e:
                # Non-fatal: parse whatever rendered before the timeout.
                logger.warning(f"[Worker {self.worker_id}] Timeout waiting for selectors: {e}")

            html = await page.content()
            soup = BeautifulSoup(html, "html.parser")

            results = extract_search_results(soup, url)
            return results[:max_articles]

        finally:
            await page.close()

    async def scrape_search(self, query: str, max_articles: int, progress_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> List[Dict[str, Any]]:
        """Scrapes search results using this worker's context."""
        if progress_callback:
            await progress_callback(f"[Worker {self.worker_id}] Starting search for '{query}'...")
        url = f"https://medium.com/search?q={query.replace(' ', '+')}"
        return await self._scrape_listing(url, max_articles, progress_callback)

    async def scrape_tag(self, tag: str, max_articles: int, progress_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> List[Dict[str, Any]]:
        """Scrapes tag results using this worker's context."""
        if progress_callback:
            await progress_callback(f"[Worker {self.worker_id}] Starting tag scrape for '{tag}'...")
        tag_slug = tag.lower().replace(" ", "-")
        url = f"https://medium.com/tag/{tag_slug}"
        return await self._scrape_listing(url, max_articles, progress_callback)

    async def close(self):
        """Closes the context and resets it so ensure_context can recreate one."""
        if self.context:
            await self.context.close()
            # Without this reset, ensure_context would reuse a closed context.
            self.context = None

    async def scrape_article(self, url: str) -> Dict[str, Any]:
        """Scrapes full article content using this worker's context.

        Falls back to Google Cache if the article page never renders, and to
        raw body text if markdown extraction fails entirely.
        """
        await self.ensure_context()
        page = await self.context.new_page()

        try:
            # Stealth: remove the webdriver property so basic bot checks pass.
            await page.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

            await page.set_extra_http_headers({"User-Agent": self._ARTICLE_USER_AGENT})

            logger.info(f"[Worker {self.worker_id}] Navigating to article: {url}")

            # Anti-bot: visit the homepage first to pick up cookies.
            await page.goto("https://medium.com/", wait_until="domcontentloaded")
            await page.wait_for_timeout(1500)

            # Navigate to the article itself.
            await page.goto(url, wait_until="domcontentloaded", timeout=20000)

            # Give dynamic content time to load.
            await page.wait_for_timeout(3000)

            try:
                # Wait for the main article content; try multiple selectors.
                await page.wait_for_selector("article, section, main, div[role='main'], h1", timeout=20000)

                # Scroll to the bottom to trigger lazy loading.
                for i in range(5):
                    await page.evaluate("window.scrollBy(0, window.innerHeight)")
                    await page.wait_for_timeout(500)

                # Settle after scrolling.
                await page.wait_for_timeout(2000)

            except Exception as e:
                logger.warning(f"[Worker {self.worker_id}] Timeout waiting for article selectors: {e}")

                # Fallback: try Google Cache (text-only view).
                try:
                    logger.info(f"[Worker {self.worker_id}] Trying Google Cache for: {url}")
                    cache_url = f"http://webcache.googleusercontent.com/search?q=cache:{url}&strip=1"
                    await page.goto(cache_url, wait_until="domcontentloaded", timeout=15000)

                    # Cache pages put content straight in the body; the normal
                    # extractor can handle it.
                    await page.wait_for_selector("body", timeout=5000)

                except Exception as e2:
                    logger.warning(f"[Worker {self.worker_id}] Google Cache failed: {e2}")

            html = await page.content()
            soup = BeautifulSoup(html, "html.parser")

            content = extract_article_content(soup, url=url)  # Pass URL for fallback parsing
            content["html_debug"] = html  # For debugging

            # Last resort if markdown extraction came back empty.
            if not content.get("markdownContent"):
                body = soup.find("body")
                if body:
                    text = body.get_text(separator="\n", strip=True)
                    if len(text) > 500:
                        content["markdownContent"] = text[:5000]  # Limit fallback text
                    else:
                        content["markdownContent"] = "Could not extract article content. It might be behind a paywall or login."

            return content

        finally:
            await page.close()
187
+
188
+
189
class ScraperService:
    """
    Manages a pool of ScraperWorkers for concurrent scraping.

    Workers are checked out of an asyncio.Queue, so at most *max_workers*
    scrapes run concurrently; the browser is lazily (re)started on demand.
    """
    def __init__(self, max_workers: int = 5, headless: bool = True):
        self.max_workers = max_workers
        self.headless = headless
        self.playwright = None
        self.browser: Optional[Browser] = None
        self.workers: List[ScraperWorker] = []
        self.worker_queue = asyncio.Queue()
        self._initialized = False
        # Serializes initialization/restart so only one coroutine launches
        # the browser.
        self._lock = asyncio.Lock()

    async def ensure_initialized(self):
        """Starts Playwright, Browser, and Workers (restarting if the browser died)."""
        async with self._lock:
            # If the browser crashed or disconnected, tear down and rebuild.
            if self.browser and not self.browser.is_connected():
                logger.warning("Browser is disconnected. Restarting...")
                await self.close()
                self.worker_queue = asyncio.Queue()

            if self._initialized:
                return

            logger.info("Initializing Scraper Service...")
            self.playwright = await async_playwright().start()
            self.browser = await self.playwright.chromium.launch(headless=self.headless)

            # Create the worker pool and make every worker available.
            self.workers = []
            for i in range(self.max_workers):
                worker = ScraperWorker(i, self.browser)
                self.workers.append(worker)
                await self.worker_queue.put(worker)

            self._initialized = True
            logger.info(f"Initialized {self.max_workers} workers.")

    async def _get_worker(self) -> ScraperWorker:
        """Retrieves a free worker from the queue (initializing if needed)."""
        # ensure_initialized handles both first-time startup and the
        # disconnected-browser restart path.
        await self.ensure_initialized()
        return await self.worker_queue.get()

    async def _release_worker(self, worker: ScraperWorker):
        """Returns a worker to the queue."""
        await self.worker_queue.put(worker)

    async def scrape_search(self, query: str, max_articles: int = 5, progress_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> List[Dict[str, Any]]:
        """Delegates search to a free worker."""
        worker = await self._get_worker()
        try:
            return await worker.scrape_search(query, max_articles, progress_callback)
        finally:
            await self._release_worker(worker)

    async def scrape_tag(self, tag: str, max_articles: int = 5, progress_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> List[Dict[str, Any]]:
        """Delegates tag scrape to a free worker."""
        worker = await self._get_worker()
        try:
            return await worker.scrape_tag(tag, max_articles, progress_callback)
        finally:
            await self._release_worker(worker)

    async def close(self):
        """Closes all workers and the browser, resetting state for a restart."""
        logger.info("Closing ScraperService...")
        for worker in self.workers:
            try:
                await worker.close()
            except Exception as e:
                # A dead context must not prevent browser/playwright cleanup.
                logger.warning(f"Error closing worker {worker.worker_id}: {e}")
        self.workers = []

        if self.browser:
            try:
                await self.browser.close()
            except Exception as e:
                logger.warning(f"Error closing browser: {e}")
            self.browser = None
        if self.playwright:
            await self.playwright.stop()
            self.playwright = None
        # Allow ensure_initialized to start fresh after a close.
        self._initialized = False

    async def scrape_article(self, url: str) -> Dict[str, Any]:
        """Delegates article scrape to a free worker."""
        worker = await self._get_worker()
        try:
            return await worker.scrape_article(url)
        finally:
            await self._release_worker(worker)
279
+
src/state.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from apify import Actor
2
+ from typing import Set
3
+
4
class StateManager:
    """
    Manages the persistent state of the actor, specifically for deduplication.
    """

    def __init__(self):
        # URLs already pushed or enqueued by previous runs.
        self.seen_urls: Set[str] = set()

    async def load_state(self):
        """
        Loads the state from the default key-value store.
        """
        state = await Actor.get_value("STATE")
        if not state:
            state = self._read_local_backup()
        self.seen_urls = set(state.get("seen_urls", []))
        Actor.log.info(f"Loaded state: {len(self.seen_urls)} seen URLs.")

    def _read_local_backup(self) -> dict:
        """Fallback for local development: read state from ./local_state.json."""
        import json
        from pathlib import Path
        local_state = Path.cwd() / "local_state.json"
        Actor.log.info(f"Checking for local state at: {local_state}")
        if not local_state.exists():
            return {}
        try:
            data = json.loads(local_state.read_text(encoding="utf-8"))
            Actor.log.info(f"Loaded state from local file: {local_state}")
            return data
        except Exception as e:
            Actor.log.warning(f"Failed to load local state: {e}")
            return {}

    async def save_state(self):
        """
        Saves the current state to the default key-value store.
        """
        state = {"seen_urls": list(self.seen_urls)}
        await Actor.set_value("STATE", state)
        self._write_local_backup(state)
        Actor.log.info(f"Saved state: {len(self.seen_urls)} seen URLs.")

    def _write_local_backup(self, state: dict) -> None:
        """Best-effort mirror of the state to ./local_state.json for local runs."""
        import json
        from pathlib import Path
        local_state = Path.cwd() / "local_state.json"
        try:
            local_state.write_text(json.dumps(state, indent=2), encoding="utf-8")
            Actor.log.info(f"Backed up state to local file: {local_state}")
        except Exception as e:
            Actor.log.warning(f"Failed to backup local state: {e}")

    def is_seen(self, url: str) -> bool:
        """
        Checks if a URL has already been seen.
        """
        return url in self.seen_urls

    def add_seen(self, url: str):
        """
        Adds a URL to the set of seen URLs.
        """
        self.seen_urls.add(url)
src/utils.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from apify import Actor
2
+
3
async def block_resources(route):
    """
    Blocks unnecessary resources to speed up scraping.

    Aborts requests for heavy static assets (images, stylesheets, fonts,
    media); every other request type is allowed to proceed.
    """
    heavy_asset_types = ("image", "stylesheet", "font", "media")
    if route.request.resource_type not in heavy_asset_types:
        await route.continue_()
    else:
        await route.abort()