Spaces:
Sleeping
Sleeping
Nikhil Pravin Pise committed on
Commit ·
a80eeb8
1
Parent(s): 81035fa
Initial deploy
Browse files- .env +4 -0
- Dockerfile +33 -0
- README.md +8 -7
- app.py +149 -0
- requirements.txt +13 -0
- server.py +386 -0
- src/__init__.py +0 -0
- src/__main__.py +6 -0
- src/__pycache__/__init__.cpython-313.pyc +0 -0
- src/__pycache__/__main__.cpython-313.pyc +0 -0
- src/__pycache__/config.cpython-313.pyc +0 -0
- src/__pycache__/main.cpython-313.pyc +0 -0
- src/__pycache__/parser.cpython-313.pyc +0 -0
- src/__pycache__/service.cpython-313.pyc +0 -0
- src/__pycache__/state.cpython-313.pyc +0 -0
- src/__pycache__/utils.cpython-313.pyc +0 -0
- src/config.py +86 -0
- src/main.py +180 -0
- src/parser.py +309 -0
- src/py.typed +0 -0
- src/service.py +279 -0
- src/state.py +69 -0
- src/utils.py +10 -0
.env
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Add your API keys in Hugging Face Space Settings (Secrets)
|
| 2 |
+
# GEMINI_API_KEY=
|
| 3 |
+
# OPENAI_API_KEY=
|
| 4 |
+
# ELEVENLABS_API_KEY=
|
Dockerfile
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
# Set working directory
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# Install system dependencies
|
| 7 |
+
RUN apt-get update && apt-get install -y \
|
| 8 |
+
git \
|
| 9 |
+
wget \
|
| 10 |
+
gnupg \
|
| 11 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 12 |
+
|
| 13 |
+
# Copy requirements first to leverage cache
|
| 14 |
+
COPY requirements.txt .
|
| 15 |
+
|
| 16 |
+
# Install Python dependencies
|
| 17 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 18 |
+
|
| 19 |
+
# Install Playwright and browsers
|
| 20 |
+
RUN playwright install --with-deps chromium
|
| 21 |
+
|
| 22 |
+
# Copy the rest of the application
|
| 23 |
+
COPY . .
|
| 24 |
+
|
| 25 |
+
# Set environment variables
|
| 26 |
+
ENV PYTHONPATH=/app
|
| 27 |
+
ENV PYTHONUNBUFFERED=1
|
| 28 |
+
|
| 29 |
+
# Expose port 7860 for Gradio
|
| 30 |
+
EXPOSE 7860
|
| 31 |
+
|
| 32 |
+
# Run the application
|
| 33 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,12 +1,13 @@
|
|
| 1 |
---
|
| 2 |
-
title: Medium
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
-
|
| 9 |
-
short_description: A MCP Server with a Scraper built in
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Medium Agent
|
| 3 |
+
emoji: 📝
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: black
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
app_port: 7860
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# Medium Agent
|
| 12 |
+
|
| 13 |
+
A powerful Medium article scraper and audio generator.
|
app.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import asyncio
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
import ast
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
# Load environment variables
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
# Import tools from server
|
| 12 |
+
# We assume server.py is in the same directory
|
| 13 |
+
try:
|
| 14 |
+
from server import medium_search, medium_cast, medium_synthesize
|
| 15 |
+
except ImportError:
|
| 16 |
+
# If running locally with different structure, try to adjust path
|
| 17 |
+
sys.path.append(os.path.dirname(__file__))
|
| 18 |
+
from server import medium_search, medium_cast, medium_synthesize
|
| 19 |
+
|
| 20 |
+
async def search_wrapper(query):
    """Search Medium via the MCP tool and render the results as an HTML card grid.

    Args:
        query: Free-text search string entered in the UI.

    Returns:
        An HTML string of result cards, or a plain status/error message.
    """
    if not query:
        return "Please enter a query."

    gr.Info(f"Searching for '{query}'...")
    # FIX: initialize before the try-block. The original referenced result_str
    # in the except handler, which raised NameError (masking the real error)
    # whenever medium_search itself failed before the assignment.
    result_str = ""
    try:
        # Get string result from tool (it returns str(list_of_dicts)).
        result_str = await medium_search(query)

        # Parse string back to a list; literal_eval only accepts Python literals.
        results = ast.literal_eval(result_str)

        if not results:
            return "No results found."

        html = "<div style='display: grid; grid-template-columns: repeat(auto-fill, minmax(300px, 1fr)); gap: 20px;'>"
        for art in results:
            title = art.get('title', 'No Title')
            url = art.get('url', '#')
            author = art.get('author', {}).get('name', 'Unknown') if art.get('author') else 'Unknown'
            publication = art.get('publication', '')
            if publication and author == 'Unknown':
                author = publication

            img = art.get('imageUrl', '')

            # Fallback image if empty
            if not img:
                img = "https://miro.medium.com/max/1400/1*jfdwtvU6V6g99q3G7gq7dQ.png"

            html += f"""
            <div style='border: 1px solid #ddd; border-radius: 8px; overflow: hidden; padding: 0; background: #2b2b2b; color: #fff; display: flex; flex-direction: column;'>
                <div style='height: 160px; background-image: url("{img}"); background-size: cover; background-position: center;'></div>
                <div style='padding: 15px; flex-grow: 1;'>
                    <h3 style='margin: 0 0 10px 0; font-size: 16px; line-height: 1.4;'><a href='{url}' target='_blank' style='color: #fff; text-decoration: none;'>{title}</a></h3>
                    <p style='margin: 0; font-size: 12px; color: #aaa;'>By {author}</p>
                </div>
            </div>
            """
        html += "</div>"
        return html
    except Exception as e:
        return f"Error parsing results: {e}. Raw output: {result_str}"
|
| 63 |
+
|
| 64 |
+
async def audio_wrapper(url, voice_id):
    """Generate audio for a Medium article URL via the medium_cast tool.

    Args:
        url: Article URL entered in the UI.
        voice_id: ElevenLabs voice ID (used only by the ElevenLabs fallback).

    Returns:
        A (status_message, audio_filepath_or_None) tuple for the Gradio outputs.
    """
    if not url:
        return "Please enter a URL.", None

    gr.Info("Generating Audio... This may take a minute.")

    # Note: medium_cast uses Edge-TTS by default (free), so we don't strictly need API keys
    # unless falling back to ElevenLabs/OpenAI

    try:
        result = await medium_cast(url, voice_id)

        # Success messages embed the output path after the final ": ".
        if "Audio generated successfully" in result:
            try:
                # FIX: rsplit on the last ": " — plain split(": ")[1] returned the
                # wrong segment if the message contained more than one ": ".
                path = result.rsplit(": ", 1)[-1].strip()
                # Remove any markdown formatting if present
                path = path.replace("`", "")
                if os.path.exists(path):
                    return result, path
                return f"{result} (File not found at {path})", None
            except Exception:
                # FIX: was a bare `except:` — that also swallowed SystemExit
                # and KeyboardInterrupt. Keep best-effort behavior otherwise.
                return result, None
        return result, None
    except Exception as e:
        return f"Error: {str(e)}", None
|
| 92 |
+
|
| 93 |
+
async def synthesize_wrapper(topic):
    """Run the medium_synthesize tool for *topic* and return the report text.

    Requires at least one LLM API key (Gemini or OpenAI) to be configured;
    otherwise returns a warning without starting the expensive scrape.
    """
    if not topic:
        return "Please enter a topic."

    # Check for an LLM key before kicking off a multi-minute scrape.
    has_llm_key = bool(os.environ.get("GEMINI_API_KEY")) or bool(os.environ.get("OPENAI_API_KEY"))
    if not has_llm_key:
        return "⚠️ Warning: No GEMINI_API_KEY or OPENAI_API_KEY found. Synthesis might fail or return mock data."

    gr.Info(f"Synthesizing report for '{topic}'... This involves scraping multiple articles and may take 2-3 minutes.")
    try:
        report = await medium_synthesize(topic)
    except Exception as e:
        return f"Error during synthesis: {str(e)}"
    return report
|
| 106 |
+
|
| 107 |
+
# Build UI
|
| 108 |
+
with gr.Blocks(title="Medium Agent", theme=gr.themes.Soft()) as demo:
|
| 109 |
+
gr.Markdown("# 📝 Medium Agent")
|
| 110 |
+
gr.Markdown("Search, Read, and Listen to Medium articles. Powered by MCP and Playwright.")
|
| 111 |
+
|
| 112 |
+
with gr.Tab("🔍 Search"):
|
| 113 |
+
gr.Markdown("### Search Medium Articles")
|
| 114 |
+
with gr.Row():
|
| 115 |
+
search_input = gr.Textbox(label="Query", placeholder="e.g. AI Agents", scale=4)
|
| 116 |
+
search_btn = gr.Button("Search", variant="primary", scale=1)
|
| 117 |
+
search_output = gr.HTML(label="Results")
|
| 118 |
+
search_btn.click(search_wrapper, inputs=search_input, outputs=search_output)
|
| 119 |
+
search_input.submit(search_wrapper, inputs=search_input, outputs=search_output)
|
| 120 |
+
|
| 121 |
+
with gr.Tab("🎧 Audio Article"):
|
| 122 |
+
gr.Markdown("### Convert Article to Audio")
|
| 123 |
+
gr.Markdown("Uses Edge-TTS (Free) by default. Falls back to ElevenLabs/OpenAI if configured.")
|
| 124 |
+
with gr.Row():
|
| 125 |
+
url_input = gr.Textbox(label="Article URL", placeholder="https://medium.com/...", scale=4)
|
| 126 |
+
audio_btn = gr.Button("Generate Audio", variant="primary", scale=1)
|
| 127 |
+
|
| 128 |
+
with gr.Accordion("Advanced Options", open=False):
|
| 129 |
+
voice_input = gr.Textbox(label="Voice ID (for ElevenLabs)", value="JBFqnCBsd6RMkjVDRZzb")
|
| 130 |
+
|
| 131 |
+
# Output status and audio player
|
| 132 |
+
audio_status = gr.Textbox(label="Status", interactive=False)
|
| 133 |
+
audio_player = gr.Audio(label="Play Audio", type="filepath")
|
| 134 |
+
|
| 135 |
+
audio_btn.click(audio_wrapper, inputs=[url_input, voice_input], outputs=[audio_status, audio_player])
|
| 136 |
+
|
| 137 |
+
with gr.Tab("🧠 Smart Synthesis"):
|
| 138 |
+
gr.Markdown("### Generate 'State of the Union' Report")
|
| 139 |
+
gr.Markdown("Scrapes top articles on a topic and uses Gemini/OpenAI to generate a comprehensive report.")
|
| 140 |
+
with gr.Row():
|
| 141 |
+
topic_input = gr.Textbox(label="Topic", placeholder="e.g. Generative AI", scale=4)
|
| 142 |
+
synth_btn = gr.Button("Synthesize", variant="primary", scale=1)
|
| 143 |
+
synth_output = gr.Markdown(label="Report")
|
| 144 |
+
synth_btn.click(synthesize_wrapper, inputs=topic_input, outputs=synth_output)
|
| 145 |
+
topic_input.submit(synthesize_wrapper, inputs=topic_input, outputs=synth_output)
|
| 146 |
+
|
| 147 |
+
if __name__ == "__main__":
|
| 148 |
+
# Launch with 0.0.0.0 for Docker/Cloud support
|
| 149 |
+
demo.launch(server_name="0.0.0.0", server_port=7860)
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
playwright>=1.40.0
|
| 2 |
+
beautifulsoup4>=4.12.0
|
| 3 |
+
markdownify>=0.11.6
|
| 4 |
+
httpx>=0.25.0
|
| 5 |
+
aiofiles>=23.2.1
|
| 6 |
+
google-generativeai>=0.3.0
|
| 7 |
+
openai>=1.3.0
|
| 8 |
+
edge-tts>=6.1.0
|
| 9 |
+
elevenlabs>=0.2.0
|
| 10 |
+
mcp>=0.9.0
|
| 11 |
+
fastmcp>=0.2.0
|
| 12 |
+
python-dotenv>=1.0.0
|
| 13 |
+
gradio>=4.0.0
|
server.py
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import asyncio
|
| 4 |
+
import httpx
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
from elevenlabs.client import ElevenLabs
|
| 7 |
+
from openai import AsyncOpenAI
|
| 8 |
+
import google.generativeai as genai
|
| 9 |
+
import edge_tts
|
| 10 |
+
|
| 11 |
+
# Add sibling 'Medium-Scraper' directory to sys.path to access 'src'
|
| 12 |
+
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../Medium-Scraper"))
|
| 13 |
+
if project_root not in sys.path:
|
| 14 |
+
sys.path.insert(0, project_root)
|
| 15 |
+
|
| 16 |
+
from mcp.server.fastmcp import FastMCP, Context, Image
|
| 17 |
+
from src.service import ScraperService
|
| 18 |
+
|
| 19 |
+
# Initialize FastMCP
|
| 20 |
+
mcp = FastMCP("Medium Scraper")
|
| 21 |
+
|
| 22 |
+
# Initialize Scraper Service (Worker Pool)
|
| 23 |
+
scraper = ScraperService(max_workers=5)
|
| 24 |
+
|
| 25 |
+
# --- Resources ---
|
| 26 |
+
|
| 27 |
+
@mcp.resource("medium://trending")
|
| 28 |
+
async def get_trending(ctx: Context = None) -> str:
|
| 29 |
+
"""Returns the top trending articles on Medium."""
|
| 30 |
+
# We use the 'trending' tag as a proxy
|
| 31 |
+
if ctx:
|
| 32 |
+
await ctx.info("Fetching trending articles...")
|
| 33 |
+
|
| 34 |
+
results = await scraper.scrape_tag("trending", max_articles=10, progress_callback=ctx.info if ctx else None)
|
| 35 |
+
return str(results)
|
| 36 |
+
|
| 37 |
+
@mcp.resource("medium://tag/{tag}")
|
| 38 |
+
async def get_tag_feed(tag: str, ctx: Context = None) -> str:
|
| 39 |
+
"""Returns the latest articles for a specific tag."""
|
| 40 |
+
if ctx:
|
| 41 |
+
await ctx.info(f"Fetching articles for tag: {tag}...")
|
| 42 |
+
|
| 43 |
+
results = await scraper.scrape_tag(tag, max_articles=10, progress_callback=ctx.info if ctx else None)
|
| 44 |
+
return str(results)
|
| 45 |
+
|
| 46 |
+
# --- Tools ---
|
| 47 |
+
|
| 48 |
+
@mcp.tool()
|
| 49 |
+
async def medium_search(query: str, ctx: Context = None) -> str:
|
| 50 |
+
"""
|
| 51 |
+
Search Medium for articles.
|
| 52 |
+
Args:
|
| 53 |
+
query: The search query (e.g. "AI Agents", "Python Asyncio")
|
| 54 |
+
"""
|
| 55 |
+
if ctx:
|
| 56 |
+
await ctx.info(f"Searching for: {query}...")
|
| 57 |
+
|
| 58 |
+
results = await scraper.scrape_search(query, progress_callback=ctx.info if ctx else None)
|
| 59 |
+
return str(results)
|
| 60 |
+
|
| 61 |
+
@mcp.tool()
|
| 62 |
+
async def medium_fresh(tag: str, ctx: Context = None) -> str:
|
| 63 |
+
"""
|
| 64 |
+
Get the latest articles for a specific tag (Freshness).
|
| 65 |
+
Args:
|
| 66 |
+
tag: The topic tag (e.g. "Artificial Intelligence")
|
| 67 |
+
"""
|
| 68 |
+
if ctx:
|
| 69 |
+
await ctx.info(f"Fetching fresh articles for: {tag}...")
|
| 70 |
+
|
| 71 |
+
results = await scraper.scrape_tag(tag, progress_callback=ctx.info if ctx else None)
|
| 72 |
+
return str(results)
|
| 73 |
+
|
| 74 |
+
@mcp.tool()
async def get_thumbnail(image_url: str, ctx: Context = None) -> Image:
    """
    Fetch an image from a URL and return it as an MCP Image object.
    Args:
        image_url: The URL of the image to fetch.
    """
    if ctx:
        await ctx.info(f"Fetching image: {image_url}...")
    # Download the raw bytes and hand them to the MCP client wrapper.
    async with httpx.AsyncClient() as client:
        resp = await client.get(image_url)
        resp.raise_for_status()
        payload = resp.content
    # NOTE(review): format is hard-coded to "png" regardless of the actual
    # Content-Type — presumably acceptable for Medium thumbnails; verify.
    return Image(data=payload, format="png")
|
| 87 |
+
|
| 88 |
+
@mcp.tool()
async def medium_cast(url: str, voice_id: str = "JBFqnCBsd6RMkjVDRZzb", ctx: Context = None) -> str:
    """
    Convert a Medium article into audio.

    Tries Edge-TTS first (free, no API key needed), then falls back to
    ElevenLabs (if ELEVENLABS_API_KEY is set), then OpenAI TTS
    (if OPENAI_API_KEY is set).

    Args:
        url: The URL of the article.
        voice_id: The ElevenLabs voice ID to use (default: 'JBFqnCBsd6RMkjVDRZzb' - George).

    Returns:
        A status string; on success it ends with ": <path-to-mp3>".
    """
    # FIX: the original returned "Error: ELEVENLABS_API_KEY not set." up front,
    # which blocked the free Edge-TTS path entirely (app.py explicitly relies on
    # Edge-TTS working without keys). The key is now only required when the
    # ElevenLabs fallback is actually attempted below.
    if ctx:
        await ctx.info(f"Scraping article for audio: {url}...")

    try:
        article = await scraper.scrape_article(url)
        if not article:
            return "Error: Failed to scrape article (returned None)."

        text = article.get("markdownContent", "")
        title = article.get("title") or "Article"

        author_data = article.get("author")
        author = author_data.get("name") if (author_data and isinstance(author_data, dict)) else None
        publication = article.get("publication")  # publication is tracked separately

        if ctx:
            await ctx.info("Scraping complete. Processing text...")

        # Handle missing/blocked content by narrating a fallback script.
        if not text or "Could not extract" in text or "Verify you are human" in text:
            # If text starts with "Summary:", we only got the meta description.
            description = ""
            if text and text.startswith("Summary:"):
                description = text.replace("Summary:", "").strip()

            # Build attribution line
            attribution = f"Title: {title}."
            if author:
                attribution += f" By {author}."
            elif publication:
                attribution += f" Published by {publication}."

            text = attribution + " "

            if description:
                text += f"Here is a summary: {description}. "
                text += "I could not retrieve the full text due to access restrictions, but I hope this summary is helpful."
            else:
                text += "I could not retrieve the full text of this article due to access restrictions, but I encourage you to read it on Medium."

        # Final validation
        if not text or len(text.strip()) < 10:
            return "Error: No text available to generate audio."

        # Truncate for TTS (save cost/time)
        if len(text) > 2500:
            text = text[:2500] + "... (end of preview)"

        import uuid
        output_filename = f"output_{uuid.uuid4().hex}.mp3"
        output_path = os.path.join(os.path.dirname(__file__), output_filename)

        if ctx:
            await ctx.info(f"Text prepared ({len(text)} chars). Starting audio generation...")

        # 1. Try Edge-TTS (Free, High Quality)
        try:
            if ctx:
                await ctx.info("Generating audio with Edge-TTS (Free)...")

            # Voice: en-US-ChristopherNeural (Male) or en-US-AriaNeural (Female)
            communicate = edge_tts.Communicate(text, "en-US-ChristopherNeural")
            await communicate.save(output_path)

            # Verify the file was actually written before declaring success.
            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                return f"Audio generated successfully (via Edge-TTS): {output_path}"
            raise Exception("Edge-TTS generated empty file.")
        except Exception as e:
            if ctx:
                await ctx.info(f"Edge-TTS failed: {e}. Falling back to ElevenLabs...")

        # 2. Fallback: ElevenLabs (only if a key is configured)
        api_key = os.environ.get("ELEVENLABS_API_KEY")
        if api_key:
            try:
                if ctx:
                    await ctx.info("Generating audio with ElevenLabs...")

                # Run blocking SDK I/O in a worker thread to keep the loop free.
                def _run_elevenlabs():
                    client = ElevenLabs(api_key=api_key)
                    audio_generator = client.text_to_speech.convert(
                        text=text,
                        voice_id=voice_id,
                        model_id="eleven_multilingual_v2",
                        output_format="mp3_44100_128",
                    )
                    with open(output_path, "wb") as f:
                        for chunk in audio_generator:
                            f.write(chunk)

                await asyncio.to_thread(_run_elevenlabs)

                if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                    return f"Audio generated successfully: {output_path}"
            except Exception as e:
                if "quota_exceeded" in str(e):
                    if ctx:
                        await ctx.info("ElevenLabs quota exceeded. Falling back to OpenAI TTS...")
                else:
                    if ctx:
                        await ctx.info(f"ElevenLabs failed: {e}. Falling back to OpenAI TTS...")

        # 3. Fallback: OpenAI
        openai_key = os.environ.get("OPENAI_API_KEY")
        if not openai_key:
            return "Error: Edge-TTS failed, ElevenLabs failed/missing, and OPENAI_API_KEY not set."

        try:
            if ctx:
                await ctx.info("Generating audio with OpenAI TTS...")

            client = AsyncOpenAI(api_key=openai_key)
            response = await client.audio.speech.create(
                model="tts-1",
                voice="alloy",
                input=text[:4096]  # OpenAI limit
            )

            response.stream_to_file(output_path)
            return f"Audio generated successfully (via OpenAI Fallback): {output_path}"
        except Exception as e2:
            if "insufficient_quota" in str(e2) or "429" in str(e2):
                return "Error: All TTS services (Edge-TTS, ElevenLabs, OpenAI) failed or quotas exceeded."
            return f"Error generating audio (All fallbacks failed): {str(e2)}"

    except Exception as e:
        return f"Error generating audio: {str(e)}"
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
@mcp.tool()
|
| 237 |
+
async def medium_synthesize(topic: str, ctx: Context = None) -> str:
|
| 238 |
+
"""
|
| 239 |
+
Synthesize a 'State of the Union' report on a topic using top Medium articles.
|
| 240 |
+
Args:
|
| 241 |
+
topic: The topic to analyze (e.g. "Generative AI").
|
| 242 |
+
"""
|
| 243 |
+
api_key = os.environ.get("OPENAI_API_KEY")
|
| 244 |
+
# We check for OpenAI key as a baseline, but Gemini key is checked inside
|
| 245 |
+
if not api_key and not os.environ.get("GEMINI_API_KEY"):
|
| 246 |
+
return "Error: Neither OPENAI_API_KEY nor GEMINI_API_KEY is set."
|
| 247 |
+
|
| 248 |
+
if ctx:
|
| 249 |
+
await ctx.info(f"Scraping top articles for: {topic}...")
|
| 250 |
+
|
| 251 |
+
# 1. Scrape Top Articles
|
| 252 |
+
articles = await scraper.scrape_search(topic, max_articles=5, progress_callback=ctx.info if ctx else None)
|
| 253 |
+
|
| 254 |
+
if not articles:
|
| 255 |
+
return "No articles found to synthesize."
|
| 256 |
+
|
| 257 |
+
# 2. Prepare Context for LLM (Parallel Scraping)
|
| 258 |
+
if ctx:
|
| 259 |
+
await ctx.info(f"Deep scraping {len(articles)} articles in parallel...")
|
| 260 |
+
|
| 261 |
+
async def _scrape_single_article(art):
|
| 262 |
+
url = art.get('url')
|
| 263 |
+
title = art.get('title')
|
| 264 |
+
author = art.get('author', {}).get('name')
|
| 265 |
+
|
| 266 |
+
content = ""
|
| 267 |
+
try:
|
| 268 |
+
full_art = await scraper.scrape_article(url)
|
| 269 |
+
content = full_art.get("markdownContent", "")
|
| 270 |
+
if "Could not extract" in content:
|
| 271 |
+
content = ""
|
| 272 |
+
except Exception:
|
| 273 |
+
content = ""
|
| 274 |
+
|
| 275 |
+
if not content:
|
| 276 |
+
content = f"Summary: {title} by {author}. (Full content unavailable)."
|
| 277 |
+
|
| 278 |
+
# Truncate
|
| 279 |
+
content = content[:2000] + "..." if len(content) > 2000 else content
|
| 280 |
+
return f"\nTitle: {title}\nURL: {url}\nAuthor: {author}\nContent:\n{content}\n"
|
| 281 |
+
|
| 282 |
+
# Run all scrapes concurrently
|
| 283 |
+
results = await asyncio.gather(*[_scrape_single_article(art) for art in articles])
|
| 284 |
+
context_text = "".join(results)
|
| 285 |
+
|
| 286 |
+
if ctx:
|
| 287 |
+
await ctx.info("Synthesizing insights...")
|
| 288 |
+
|
| 289 |
+
# 3. Call LLM (Gemini First)
|
| 290 |
+
gemini_key = os.environ.get("GEMINI_API_KEY")
|
| 291 |
+
report = ""
|
| 292 |
+
|
| 293 |
+
# Try Gemini
|
| 294 |
+
if gemini_key:
|
| 295 |
+
try:
|
| 296 |
+
if ctx:
|
| 297 |
+
await ctx.info("Synthesizing with Gemini (2.5-Flash)...")
|
| 298 |
+
genai.configure(api_key=gemini_key)
|
| 299 |
+
model = genai.GenerativeModel('gemini-2.5-flash')
|
| 300 |
+
|
| 301 |
+
prompt = f"""You are a tech analyst. Synthesize the following Medium articles into a 'State of the Union' report.
|
| 302 |
+
Highlight trends, key players, and sentiment.
|
| 303 |
+
|
| 304 |
+
Topic: {topic}
|
| 305 |
+
|
| 306 |
+
Articles:
|
| 307 |
+
{context_text}
|
| 308 |
+
"""
|
| 309 |
+
|
| 310 |
+
# Gemini async generation
|
| 311 |
+
response = await model.generate_content_async(prompt)
|
| 312 |
+
report = response.text
|
| 313 |
+
return report
|
| 314 |
+
except Exception as e:
|
| 315 |
+
if ctx:
|
| 316 |
+
await ctx.info(f"Gemini failed: {e}. Falling back to OpenAI...")
|
| 317 |
+
|
| 318 |
+
# Fallback: OpenAI
|
| 319 |
+
if not api_key:
|
| 320 |
+
return f"Error: Gemini failed/missing and OPENAI_API_KEY not set. Gemini Error: {report if 'report' in locals() else 'N/A'}"
|
| 321 |
+
|
| 322 |
+
client = AsyncOpenAI(api_key=api_key)
|
| 323 |
+
try:
|
| 324 |
+
response = await client.chat.completions.create(
|
| 325 |
+
model="gpt-4o",
|
| 326 |
+
messages=[
|
| 327 |
+
{"role": "system", "content": "You are a tech analyst. Synthesize the following Medium articles into a 'State of the Union' report. Highlight trends, key players, and sentiment."},
|
| 328 |
+
{"role": "user", "content": f"Topic: {topic}\n\nArticles:\n{context_text}"}
|
| 329 |
+
]
|
| 330 |
+
)
|
| 331 |
+
report = response.choices[0].message.content
|
| 332 |
+
except Exception as e:
|
| 333 |
+
if "insufficient_quota" in str(e) or "429" in str(e):
|
| 334 |
+
if ctx:
|
| 335 |
+
await ctx.info("OpenAI quota exceeded. Generating summary report locally...")
|
| 336 |
+
|
| 337 |
+
# Mock Fallback: Generate a report based on titles and authors
|
| 338 |
+
report = f"# State of the Union: {topic} (Generated Locally)\n\n"
|
| 339 |
+
report += "> **Note:** External AI services (Gemini & OpenAI) are currently unavailable or quota exceeded. This report is generated based on available metadata.\n\n"
|
| 340 |
+
report += "## Key Articles Analyzed\n"
|
| 341 |
+
for i, art in enumerate(articles):
|
| 342 |
+
report += f"- **{art.get('title')}** by {art.get('author', {}).get('name')}\n"
|
| 343 |
+
|
| 344 |
+
report += "\n## Summary\n"
|
| 345 |
+
report += f"Recent discussions on **{topic}** focus on the themes presented in the articles above. "
|
| 346 |
+
report += "Readers are encouraged to explore the full articles for in-depth analysis."
|
| 347 |
+
else:
|
| 348 |
+
report = f"Error generating report: {str(e)}"
|
| 349 |
+
|
| 350 |
+
return report
|
| 351 |
+
|
| 352 |
+
# --- Prompts ---
|
| 353 |
+
|
| 354 |
+
@mcp.prompt()
def summarize_article(url: str) -> str:
    """Create a prompt asking the model to summarize a Medium article."""
    parts = [
        f"Please read and summarize the following Medium article: {url}",
        "",
        "Focus on the key takeaways and novel insights.",
    ]
    return "\n".join(parts)
|
| 358 |
+
|
| 359 |
+
@mcp.prompt()
def tweet_thread(url: str) -> str:
    """Create a prompt that converts an article into a Twitter thread."""
    parts = [
        f"Read this article: {url}",
        "",
        "Convert it into a viral 5-tweet thread. Use emojis and keep it punchy.",
    ]
    return "\n".join(parts)
|
| 363 |
+
|
| 364 |
+
# --- Completions (Autocomplete) ---
|
| 365 |
+
|
| 366 |
+
COMMON_TAGS = [
|
| 367 |
+
"Artificial Intelligence", "Machine Learning", "Data Science", "Programming",
|
| 368 |
+
"Python", "JavaScript", "Startup", "Technology", "Writing", "Life Lessons",
|
| 369 |
+
"Productivity", "Design", "Marketing", "Business", "Health"
|
| 370 |
+
]
|
| 371 |
+
|
| 372 |
+
@mcp.tool()
|
| 373 |
+
async def medium_search_with_autocomplete(
|
| 374 |
+
tag: str,
|
| 375 |
+
ctx: Context = None
|
| 376 |
+
) -> str:
|
| 377 |
+
"""
|
| 378 |
+
Search Medium with tag autocomplete support.
|
| 379 |
+
"""
|
| 380 |
+
# Note: True autocomplete requires client-side support via the completion API,
|
| 381 |
+
# which FastMCP handles for Enums. For open strings, we provide this tool
|
| 382 |
+
# as a hint for future expansion.
|
| 383 |
+
return await medium_fresh(tag)
|
| 384 |
+
|
| 385 |
+
if __name__ == "__main__":
|
| 386 |
+
mcp.run()
|
src/__init__.py
ADDED
|
File without changes
|
src/__main__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
|
| 3 |
+
from .main import main
|
| 4 |
+
|
| 5 |
+
# Execute the Actor entry point.
|
| 6 |
+
asyncio.run(main())
|
src/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (147 Bytes). View file
|
|
|
src/__pycache__/__main__.cpython-313.pyc
ADDED
|
Binary file (269 Bytes). View file
|
|
|
src/__pycache__/config.cpython-313.pyc
ADDED
|
Binary file (3.76 kB). View file
|
|
|
src/__pycache__/main.cpython-313.pyc
ADDED
|
Binary file (9.89 kB). View file
|
|
|
src/__pycache__/parser.cpython-313.pyc
ADDED
|
Binary file (10.6 kB). View file
|
|
|
src/__pycache__/service.cpython-313.pyc
ADDED
|
Binary file (18.3 kB). View file
|
|
|
src/__pycache__/state.cpython-313.pyc
ADDED
|
Binary file (4 kB). View file
|
|
|
src/__pycache__/utils.cpython-313.pyc
ADDED
|
Binary file (683 Bytes). View file
|
|
|
src/config.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import Optional, List, Dict, Any
|
| 3 |
+
from apify import Actor
|
| 4 |
+
|
| 5 |
+
class ActorInput(BaseModel):
    """
    Defines the expected input schema for the Actor.

    Field aliases map the camelCase keys of the platform INPUT schema onto
    the snake_case attribute names used throughout the Python code.
    """
    search_query: Optional[str] = Field(
        default=None,
        alias="searchQuery",
        description="Search query to run on Medium."
    )
    tag: Optional[str] = Field(
        default=None,
        alias="tag",
        description="Topic tag to scrape (e.g. 'AI Agents')."
    )
    start_urls: List[Dict[str, Any]] = Field(
        default_factory=lambda: [{"url": "https://medium.com"}],
        alias="start_urls",
        description="List of start URLs."
    )
    max_requests_per_crawl: int = Field(
        default=100,
        alias="maxRequestsPerCrawl",
        description="Maximum number of requests to process."
    )
    min_concurrency: int = Field(
        default=1,
        alias="minConcurrency",
        description="Minimum number of parallel requests."
    )
    max_concurrency: int = Field(
        default=10,
        alias="maxConcurrency",
        description="Maximum number of parallel requests."
    )
    max_request_retries: int = Field(
        default=3,
        alias="maxRequestRetries",
        description="Number of retries for failed requests."
    )
    max_articles: int = Field(
        default=5,
        alias="maxArticles",
        description="Maximum number of articles to scrape from search results."
    )
    scrape_full_content: bool = Field(
        default=False,
        alias="scrapeFullContent",
        description="If True, visit article pages and extract full content."
    )
    enable_deduplication: bool = Field(
        default=True,
        alias="enableDeduplication",
        description="If True, skip previously seen articles."
    )
    proxy_configuration: Optional[Dict[str, Any]] = Field(
        default=None,
        alias="proxyConfiguration",
        description="Proxy configuration settings."
    )

    @classmethod
    async def load(cls) -> "ActorInput":
        """
        Loads input from the Actor platform and validates it against the schema.

        When no platform input is available (local development), falls back to
        a `local_input.json` file in the current working directory; if that is
        also missing, an empty dict is used so all field defaults apply.

        :raises pydantic.ValidationError: if the input does not match the schema.
        """
        actor_input = await Actor.get_input()
        if not actor_input:
            # Local-development fallback: read the input from a JSON file.
            import json
            from pathlib import Path

            local_input = Path.cwd() / "local_input.json"
            Actor.log.info(f"Checking for local input at: {local_input}")
            if local_input.exists():
                Actor.log.info(f"Loading input from local file: {local_input}")
                actor_input = json.loads(local_input.read_text(encoding="utf-8"))
            else:
                Actor.log.warning(f"Local input not found at {local_input}")
                actor_input = {}

        Actor.log.info(f"Raw Actor Input: {actor_input}")
        # Keys are matched via the declared aliases (camelCase).
        return cls(**actor_input)
|
src/main.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup
|
| 2 |
+
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
|
| 3 |
+
from crawlee import Request
|
| 4 |
+
from apify import Actor
|
| 5 |
+
import asyncio
|
| 6 |
+
from datetime import timedelta
|
| 7 |
+
|
| 8 |
+
from src.config import ActorInput
|
| 9 |
+
from src.parser import extract_search_results, extract_article_content
|
| 10 |
+
from src.utils import block_resources
|
| 11 |
+
from src.state import StateManager
|
| 12 |
+
|
| 13 |
+
async def main():
    """
    Actor entry point.

    Builds start URLs from the validated input, configures a PlaywrightCrawler
    and routes each page to either listing extraction (search/tag pages) or
    full-article extraction, pushing results to the default dataset.
    """
    import re

    await Actor.init()

    # Initialized BEFORE the try-block so the `finally` clause can safely
    # reference it even when ActorInput.load() raises (the original code
    # assigned it inside the try, after loading, causing a NameError there).
    state_manager = None

    try:
        # Load and validate input
        actor_input = await ActorInput.load()
        Actor.log.info(f"Loaded Input: {actor_input}")

        # Initialize the state manager that persists the set of seen URLs.
        if actor_input.enable_deduplication:
            state_manager = StateManager()
            await state_manager.load_state()
        else:
            Actor.log.info("Deduplication disabled. Scraping all articles.")

        # Build start URLs
        start_urls = []
        if actor_input.tag:
            # Tag-based scraping (freshness): /tag/<slug>/latest
            tag_slug = actor_input.tag.lower().replace(" ", "-")
            start_urls.append(f"https://medium.com/tag/{tag_slug}/latest")
            Actor.log.info(f"Targeting Tag: {actor_input.tag} (Latest)")
        elif actor_input.search_query:
            # Search-based scraping
            q = actor_input.search_query.replace(" ", "+")
            start_urls.append(f"https://medium.com/search?q={q}")

        # Add explicit start URLs if provided
        for u in actor_input.start_urls:
            if u.get("url") and "medium.com" in u["url"]:
                start_urls.append(u["url"])

        if not start_urls:
            Actor.log.info("No search query or valid start URLs provided. Exiting.")
            await Actor.exit()
            return

        # Create proxy configuration
        proxy_config = None
        if actor_input.proxy_configuration:
            proxy_config = await Actor.create_proxy_configuration(
                actor_proxy_input=actor_input.proxy_configuration
            )

        # NOTE(review): min_concurrency / max_concurrency from the input are
        # currently unused — wiring them up would require crawlee's
        # ConcurrencySettings; confirm desired behavior before adding.
        crawler = PlaywrightCrawler(
            proxy_configuration=proxy_config,
            max_requests_per_crawl=actor_input.max_requests_per_crawl,
            max_request_retries=actor_input.max_request_retries,
            request_handler_timeout=timedelta(seconds=60),
        )

        # Medium article URLs end with a 12-char hex hash, e.g. ...-1a2b3c4d5e6f.
        # (The original check used the literal substrings "/@/" and "-{12}",
        # which never occur in real URLs, so hash detection was dead code.)
        article_hash_re = re.compile(r"-[0-9a-f]{12}(?:[/?#]|$)")

        @crawler.router.default_handler
        async def handler(context: PlaywrightCrawlingContext):
            url = context.request.url
            Actor.log.info(f"Processing: {url}")

            # Block images/fonts/etc. to speed up page loads.
            await context.page.route("**/*", block_resources)

            # Wait for content
            try:
                Actor.log.info("Waiting for selectors...")
                await context.page.wait_for_load_state("domcontentloaded")
                await context.page.wait_for_selector("article, .postArticle, .js-block", timeout=10000)
                Actor.log.info("Selectors found.")
            except Exception as e:
                # Non-fatal: we still attempt to parse whatever rendered.
                Actor.log.warning(f"Timeout waiting for selectors on {url}: {e}")

            # Parse Content
            html = await context.page.content()
            soup = BeautifulSoup(html, "html.parser")

            # --- Router Logic ---

            # 1. Article Page (deep scraping). The explicit "ARTICLE" label set
            # at enqueue time is the primary signal; the hash regex catches
            # article URLs supplied directly via start_urls.
            if context.request.label == "ARTICLE" or article_hash_re.search(url):
                Actor.log.info(f"Scraping Article Content: {url}")

                user_data = context.request.user_data
                if not isinstance(user_data, dict):
                    user_data = {}

                try:
                    loop = asyncio.get_running_loop()
                    # BeautifulSoup extraction is CPU-bound; keep it off the loop.
                    content_data = await loop.run_in_executor(None, extract_article_content, soup)
                    Actor.log.info(f"Extracted content keys: {list(content_data.keys())}")
                    if content_data.get("markdownContent"):
                        Actor.log.info(f"Markdown length: {len(content_data['markdownContent'])}")
                    else:
                        Actor.log.warning("No markdown content extracted.")
                except Exception as e:
                    Actor.log.error(f"Error extracting content: {e}")
                    content_data = {}

                # Merge listing-page metadata with the extracted content;
                # content_data values win on key collisions.
                final_data = user_data.copy()
                final_data.update({
                    "url": url,
                    "title": final_data.get("title") or (soup.title.string if soup.title else None),
                    **content_data
                })

                await context.push_data(final_data)

            # 2. Search Page or Tag Page
            elif "medium.com/search" in url or "/tag/" in url:
                Actor.log.info(f"Scraping Listing Page: {url}")

                loop = asyncio.get_running_loop()
                results = await loop.run_in_executor(None, extract_search_results, soup, url)
                Actor.log.info(f"Found {len(results)} articles.")

                pushed = 0
                for rec in results:
                    if pushed >= actor_input.max_articles:
                        break

                    full_url = rec["url"]

                    # Deduplication check against the persisted state.
                    if state_manager and state_manager.is_seen(full_url):
                        Actor.log.info(f"Skipping seen URL: {full_url}")
                        continue

                    # Record the URL so reruns skip it.
                    if state_manager:
                        state_manager.add_seen(full_url)

                    if actor_input.scrape_full_content:
                        # Enqueue for deep scraping; listing metadata rides
                        # along in user_data and is merged by the ARTICLE branch.
                        await context.add_requests([Request.from_url(
                            url=full_url,
                            label="ARTICLE",
                            user_data={
                                "title": rec.get("title"),
                                "author": rec.get("author"),
                                "publishingDate": rec.get("publishingDate"),
                                "readingTime": rec.get("readingTime"),
                                "search_query": actor_input.search_query
                            }
                        )])
                    else:
                        # Fast mode: push listing metadata directly.
                        await context.push_data(rec)

                    pushed += 1

                # Push a summary record for the listing page itself.
                await context.push_data({
                    "type": "search_page",
                    "url": url,
                    "enqueued": pushed
                })

        Actor.log.info(f"Starting crawler with URLs: {start_urls}")
        await crawler.run(start_urls)

    except Exception as e:
        Actor.log.error(f"Crawler failed: {e}")
        raise
    finally:
        # Persist dedup state and finish the Actor run regardless of outcome.
        if state_manager:
            await state_manager.save_state()
        await Actor.exit()
| 178 |
+
|
| 179 |
+
# Allow running the crawler directly: `python -m src.main`.
if __name__ == "__main__":
    asyncio.run(main())
|
src/parser.py
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup
|
| 2 |
+
from typing import Dict, List, Optional, Any
|
| 3 |
+
from markdownify import markdownify as md
|
| 4 |
+
from urllib.parse import urljoin
|
| 5 |
+
|
| 6 |
+
def extract_search_results(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
    """
    Extracts article metadata from search result cards.

    Medium's DOM markup changes over time, so a sequence of CSS selectors is
    tried in order and the first one that yields any cards wins. Cards that
    produce no resolvable URL are dropped.
    """
    cards = []
    for selector in ("article", 'div[role="article"]', ".postArticle", ".js-block"):
        cards = soup.select(selector)
        if cards:
            break

    records = (_extract_from_card(card, base_url) for card in cards)
    return [record for record in records if record.get("url")]
|
| 25 |
+
|
| 26 |
+
def _extract_from_card(card, base_url: str) -> Dict[str, Any]:
    """
    Helper to extract data from a single search-result card element.

    All extraction here is heuristic: Medium's markup is unstable, so each
    field is located via a primary selector followed by looser fallbacks.
    Returns a dict with keys: url, title, author, publishingDate,
    readingTime, imageUrl — each None when it could not be determined.
    """
    # 1. URL & Title
    # Look for <a> tags that link to the article.
    # Usually the first <h2> inside an <a> is the title.
    title_tag = card.find("h2")
    title = title_tag.get_text(strip=True) if title_tag else None

    # Find the link associated with the title, falling back to the card's
    # first link when the title has no <a> ancestor.
    link_tag = card.find("a", href=True)
    if title_tag and title_tag.find_parent("a"):
        link_tag = title_tag.find_parent("a")

    url = None
    if link_tag:
        href = link_tag["href"]
        # Clean up URL (strip tracking query params like ?source=...)
        if "?" in href:
            href = href.split("?")[0]
        # Resolve relative hrefs against the page we scraped.
        url = urljoin(base_url, href)

    # 2. Author
    # Heuristic: look for links that go to a user profile (/@username or
    # /u/username) but aren't the main article link.
    author = None

    # Try specific selectors first (user-card link, legacy .ds-link, profile href).
    author_tag = card.select_one('a[data-action="show-user-card"]') or \
                 card.select_one('.ds-link') or \
                 card.select_one('a[href*="/@"]')

    if author_tag:
        # Verify it's not the title link before trusting its text.
        if title_tag and author_tag == title_tag.find_parent("a"):
            pass # It's the title
        else:
            author = author_tag.get_text(strip=True)

    # Fallback: look for a <p> or <span> that contains the author name.
    # Usually it's the first piece of text in the card meta area.
    if not author:
        # Scan text nodes, skipping anything that looks like a date or
        # read-time rather than a person's name.
        for p in card.find_all(["p", "span"]):
            txt = p.get_text(strip=True)
            # Skip empty, date-like, or read-time strings (month abbreviations
            # and "ago" cover Medium's relative/absolute date formats).
            if not txt or "min read" in txt or any(m in txt for m in ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "ago"]):
                continue
            # Skip text that is (part of) the title.
            if title and txt in title:
                continue

            # If it looks like a name (2-3 words, capitalized), take it.
            if 0 < len(txt.split()) <= 3 and txt[0].isupper():
                author = txt
                break

    # 3. Date / Reading Time
    # Both usually live in small <span> elements in the card's meta row.
    spans = card.find_all("span")
    pub_date = None
    reading_time = None

    for s in spans:
        txt = s.get_text(strip=True)
        # Reading time usually ends with "min read".
        if "min read" in txt:
            try:
                reading_time = float(txt.replace("min read", "").strip())
            except ValueError:
                pass
        # Date heuristic: "Nov 7" or "2 days ago".
        # Hard to parse perfectly without regex; take the first short span
        # containing a digit that isn't a reading time. Very rough — the raw
        # string is kept as-is rather than normalized to a date type.
        elif not pub_date and len(txt) < 15 and any(c.isdigit() for c in txt):
            # Very rough heuristic
            pub_date = txt

    # 4. Image URL
    # Priority:
    # 1. <img> inside the card that is not an avatar/thumbnail
    # 2. First <img> tag in the card
    # Note: search results don't always have og:image tags (those are in the
    # page <head>), so we must rely on the card's own HTML.
    image_url = None

    # Medium uses responsive images, often in <picture> or <img> with srcset.
    # We look for the first image that isn't an avatar.
    images = card.find_all("img")
    for img in images:
        src = img.get("src", "")
        # Skip small avatars (identified by URL patterns).
        if "1*dmbNkD5D-u45r44go_cf0g.png" in src: # Common default avatar
            continue
        if "resize:fill:20:20" in src: # Tiny thumbnail
            continue

        # If it's a valid image, take it.
        # Medium images often have 'cdn-images-1.medium.com'.
        if src:
            image_url = src
            break

    if not image_url:
        # Fallback to any img, avatar or not.
        img_tag = card.find("img")
        if img_tag and img_tag.get("src"):
            image_url = img_tag["src"]

    # author is wrapped in a dict to match the article-page extractor's shape.
    return {
        "url": url,
        "title": title,
        "author": {"name": author} if author else None,
        "publishingDate": pub_date,
        "readingTime": reading_time,
        "imageUrl": image_url,
    }
|
| 144 |
+
|
| 145 |
+
def parse_count(text: Optional[str]) -> Optional[int]:
    """
    Parse a Medium-style count string into an integer.

    Handles plain digits ("534"), thousands separators ("1,234") and K/M
    suffixes ("1.2K" -> 1200, "2M" -> 2000000). Returns None for empty or
    unrecognizable input instead of raising, so callers can assign the
    result directly to an optional field.
    """
    if not text:
        return None
    cleaned = text.strip().replace(",", "")
    multiplier = 1
    if cleaned.endswith(("K", "k")):
        multiplier = 1000
        cleaned = cleaned[:-1]
    elif cleaned.endswith(("M", "m")):
        multiplier = 1000000
        cleaned = cleaned[:-1]
    try:
        return int(float(cleaned) * multiplier)
    except ValueError:
        return None


def extract_article_content(soup: "BeautifulSoup", url: Optional[str] = None) -> Dict[str, Any]:
    """
    Extracts full content, claps, and responses from an article page.

    If extraction fails (Cloudflare/paywall), falls back progressively:
    URL-slug parsing for the title, intro paragraphs for the body, then
    og:description / meta description as a last resort.

    :param soup: Parsed article page.
    :param url: Optional article URL, used for slug-based title/author fallbacks.
    :return: Dict with keys markdownContent, claps, responses, title, author,
        publication; each value may be None when unavailable.
    """
    # Single local import replaces the two inline imports the original
    # version performed inside separate try-blocks.
    from urllib.parse import urlparse

    content_data = {
        "markdownContent": None,
        "claps": None,
        "responses": None,
        "title": None,
        "author": None,
        "publication": None  # Tracks the publication separately from the author
    }

    # --- Title (with fallbacks) ---
    # Try the <h1> first; articles normally render the title there.
    title_tag = soup.find("h1")
    if title_tag:
        content_data["title"] = title_tag.get_text(strip=True)

    # Try og:title.
    if not content_data["title"]:
        og_title = soup.find("meta", property="og:title")
        if og_title and og_title.get("content"):
            content_data["title"] = og_title.get("content")

    # If the title is empty or generic (Cloudflare interstitial / homepage),
    # reconstruct it from the URL slug.
    is_generic_title = content_data["title"] in [None, "", "Just a moment...", "medium.com", "Medium"]
    if is_generic_title and url:
        # Medium URLs look like:
        #   https://medium.com/@author/article-title-slug-<hash>
        #   https://medium.com/publication/article-title-slug-<hash>
        try:
            path_parts = urlparse(url).path.strip("/").split("/")
            if len(path_parts) >= 2:
                # Last path segment is the article slug.
                article_slug = path_parts[-1]
                # Strip the trailing Medium hash (12 chars after the last hyphen).
                slug_parts = article_slug.rsplit("-", 1)
                if len(slug_parts) > 1 and len(slug_parts[-1]) == 12:
                    article_slug = slug_parts[0]
                # Convert slug to a readable title.
                content_data["title"] = article_slug.replace("-", " ").title()
        except Exception:
            pass

    # Last resort: the <title> element (skipped when it is a generic page).
    if not content_data["title"]:
        title_elem = soup.find("title")
        if title_elem:
            page_title = title_elem.get_text(strip=True)
            if page_title and page_title not in ["Just a moment...", "medium.com", "Medium"]:
                content_data["title"] = page_title

    # --- Author ---
    # The <meta name="author"> tag is the most reliable source.
    meta_author = soup.find("meta", attrs={"name": "author"})
    if meta_author and meta_author.get("content"):
        content_data["author"] = {"name": meta_author.get("content")}
    else:
        # Fallback to in-page selectors.
        author_tag = soup.select_one('a[data-action="show-user-card"]') or soup.select_one('.ds-link')
        if author_tag:
            author_text = author_tag.get_text(strip=True)
            if author_text:  # Only set if we got actual text
                content_data["author"] = {"name": author_text}

    # --- Publication or author from the URL path ---
    if url:
        try:
            path_parts = urlparse(url).path.strip("/").split("/")
            if len(path_parts) >= 1:
                first_part = path_parts[0]
                if first_part.startswith("@"):
                    # /@username/... -> personal blog; use as author fallback.
                    username = first_part[1:]
                    formatted_name = username.replace("-", " ").title()
                    if not content_data["author"]:
                        content_data["author"] = {"name": formatted_name}
                else:
                    # /publication-name/... -> publication (e.g. "ai-in-plain-english").
                    # Not used as author — better than nothing for blocked pages.
                    content_data["publication"] = first_part.replace("-", " ").title()
        except Exception:
            pass

    # Pre-extract og:description for fallback use below.
    og_description = soup.find("meta", property="og:description")
    fallback_description = og_description.get("content") if og_description else None

    # --- Claps / Responses ---
    # parse_count tolerates "1.2K", "2M" and "1,234"; the original int(txt)
    # silently failed on separators and never handled the "M" suffix.
    clap_el = soup.select_one('button[data-testid="clapCount"]') or soup.select_one('.clapCount')
    if clap_el:
        content_data["claps"] = parse_count(clap_el.get_text(strip=True))

    resp_el = soup.select_one('button[data-testid="responsesCount"]') or soup.select_one('.responsesCount')
    if resp_el:
        content_data["responses"] = parse_count(resp_el.get_text(strip=True))

    # --- Body content ---
    article = soup.find("article") or soup.find("section")
    if article:
        # Remove navigation/engagement clutter before converting to Markdown.
        for tag in article.select("button, .speechify-btn, .metabar, footer"):
            tag.decompose()
        content_data["markdownContent"] = md(str(article), heading_style="ATX")

    # Fallback 1: intro paragraphs (text that loaded before a paywall),
    # used when the <article> extraction yielded little or nothing.
    if not content_data["markdownContent"] or len(content_data["markdownContent"]) < 100:
        paragraphs = soup.find_all("p")
        if paragraphs:
            intro_text = []
            for p in paragraphs[:10]:  # Check first 10 paragraphs
                text = p.get_text(strip=True)
                # Skip short/meta paragraphs ("4 min read", "2 days ago").
                if len(text) > 50 and "min read" not in text.lower() and "ago" not in text:
                    intro_text.append(text)
                    if len(intro_text) >= 3:  # Got enough intro paragraphs
                        break

            if intro_text:
                combined_intro = "\n\n".join(intro_text)
                if not content_data["markdownContent"]:
                    content_data["markdownContent"] = combined_intro
                else:
                    # Append intro to existing content if it was too short.
                    content_data["markdownContent"] += "\n\n" + combined_intro

    # Fallback 2: meta descriptions (if still effectively no content).
    if not content_data["markdownContent"] or len(content_data["markdownContent"]) < 50:
        if fallback_description:
            desc_text = f"Summary: {fallback_description}"
            if content_data["markdownContent"]:
                content_data["markdownContent"] = desc_text + "\n\n" + content_data["markdownContent"]
            else:
                content_data["markdownContent"] = desc_text
        else:
            # Last resort: try name="description".
            meta_desc = soup.find("meta", attrs={"name": "description"})
            if meta_desc:
                content_data["markdownContent"] = f"Summary: {meta_desc.get('content', '')}"

    return content_data
|
src/py.typed
ADDED
|
File without changes
|
src/service.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
from typing import List, Dict, Any, Optional, Callable, Awaitable
|
| 5 |
+
import logging
|
| 6 |
+
|
| 7 |
+
# Reuse existing logic
|
| 8 |
+
from src.parser import extract_search_results, extract_article_content
|
| 9 |
+
from src.utils import block_resources
|
| 10 |
+
|
| 11 |
+
# Configure logging
|
| 12 |
+
# Configure logging. basicConfig is a no-op if the host application has
# already installed handlers on the root logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ScraperService")
|
| 14 |
+
|
| 15 |
+
class ScraperWorker:
|
| 16 |
+
"""
|
| 17 |
+
A single worker that manages its own BrowserContext.
|
| 18 |
+
"""
|
| 19 |
+
def __init__(self, worker_id: int, browser: Browser):
    # Identifier used only in log message prefixes.
    self.worker_id = worker_id
    # Shared Playwright Browser instance; owned by the caller, never
    # closed by this worker.
    self.browser = browser
    # BrowserContext is created lazily by ensure_context().
    self.context: Optional[BrowserContext] = None
    # Serializes context creation across concurrent tasks.
    self._lock = asyncio.Lock()
|
| 24 |
+
|
| 25 |
+
async def ensure_context(self):
    """Ensures this worker has an open context.

    The lock guarantees that concurrent callers do not each create a
    context; only the first creates it, later callers reuse it.
    """
    async with self._lock:
        if not self.context:
            logger.info(f"[Worker {self.worker_id}] Creating context...")
            self.context = await self.browser.new_context()
            # Optional: Block resources globally for this context if possible,
            # but route is usually per-page.
|
| 33 |
+
|
| 34 |
+
async def scrape_search(self, query: str, max_articles: int, progress_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> List[Dict[str, Any]]:
    """Scrapes search results using this worker's context.

    :param query: Free-text search query; spaces become '+' in the URL.
    :param max_articles: Upper bound on the number of result records returned.
    :param progress_callback: Optional async callback receiving status strings.
    :return: Up to max_articles metadata dicts from extract_search_results.
    """
    if progress_callback:
        await progress_callback(f"[Worker {self.worker_id}] Starting search for '{query}'...")

    await self.ensure_context()
    # A fresh page per call; closed in the finally clause below.
    page = await self.context.new_page()

    try:
        # Block resources
        # await page.route("**/*", block_resources)  # disabled; kept for reference

        # Set a desktop Chrome User-Agent to look like a regular browser.
        await page.set_extra_http_headers({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        })

        url = f"https://medium.com/search?q={query.replace(' ', '+')}"
        logger.info(f"[Worker {self.worker_id}] Navigating to: {url}")
        if progress_callback:
            await progress_callback(f"[Worker {self.worker_id}] Navigating to {url}...")
        await page.goto(url, wait_until="domcontentloaded")

        # Wait for any known result-card selector; a timeout is non-fatal —
        # we still parse whatever HTML rendered.
        try:
            await page.wait_for_selector("article, div[role='article'], .postArticle, .js-block", timeout=30000)
        except Exception as e:
            logger.warning(f"[Worker {self.worker_id}] Timeout waiting for selectors: {e}")

        html = await page.content()
        soup = BeautifulSoup(html, "html.parser")

        results = extract_search_results(soup, url)
        return results[:max_articles]

    finally:
        await page.close()
|
| 70 |
+
|
| 71 |
+
async def scrape_tag(self, tag: str, max_articles: int, progress_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> List[Dict[str, Any]]:
|
| 72 |
+
"""Scrapes tag results using this worker's context."""
|
| 73 |
+
if progress_callback:
|
| 74 |
+
await progress_callback(f"[Worker {self.worker_id}] Starting tag scrape for '{tag}'...")
|
| 75 |
+
|
| 76 |
+
await self.ensure_context()
|
| 77 |
+
page = await self.context.new_page()
|
| 78 |
+
|
| 79 |
+
try:
|
| 80 |
+
# Set User Agent
|
| 81 |
+
await page.set_extra_http_headers({
|
| 82 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
| 83 |
+
})
|
| 84 |
+
|
| 85 |
+
tag_slug = tag.lower().replace(" ", "-")
|
| 86 |
+
url = f"https://medium.com/tag/{tag_slug}"
|
| 87 |
+
logger.info(f"[Worker {self.worker_id}] Navigating to: {url}")
|
| 88 |
+
if progress_callback:
|
| 89 |
+
await progress_callback(f"[Worker {self.worker_id}] Navigating to {url}...")
|
| 90 |
+
await page.goto(url, wait_until="domcontentloaded")
|
| 91 |
+
|
| 92 |
+
try:
|
| 93 |
+
await page.wait_for_selector("article, div[role='article'], .postArticle, .js-block", timeout=30000)
|
| 94 |
+
except Exception as e:
|
| 95 |
+
logger.warning(f"[Worker {self.worker_id}] Timeout waiting for selectors: {e}")
|
| 96 |
+
|
| 97 |
+
html = await page.content()
|
| 98 |
+
soup = BeautifulSoup(html, "html.parser")
|
| 99 |
+
|
| 100 |
+
results = extract_search_results(soup, url)
|
| 101 |
+
return results[:max_articles]
|
| 102 |
+
|
| 103 |
+
finally:
|
| 104 |
+
await page.close()
|
| 105 |
+
|
| 106 |
+
async def close(self):
|
| 107 |
+
if self.context:
|
| 108 |
+
await self.context.close()
|
| 109 |
+
|
| 110 |
+
async def scrape_article(self, url: str) -> Dict[str, Any]:
|
| 111 |
+
"""Scrapes full article content using this worker's context."""
|
| 112 |
+
await self.ensure_context()
|
| 113 |
+
page = await self.context.new_page()
|
| 114 |
+
|
| 115 |
+
try:
|
| 116 |
+
# Stealth: Remove webdriver property
|
| 117 |
+
await page.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
|
| 118 |
+
|
| 119 |
+
# Set standard User Agent
|
| 120 |
+
await page.set_extra_http_headers({
|
| 121 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
| 122 |
+
})
|
| 123 |
+
|
| 124 |
+
logger.info(f"[Worker {self.worker_id}] Navigating to article: {url}")
|
| 125 |
+
|
| 126 |
+
# Anti-bot: Go to homepage first to set cookies
|
| 127 |
+
await page.goto("https://medium.com/", wait_until="domcontentloaded")
|
| 128 |
+
await page.wait_for_timeout(1500)
|
| 129 |
+
|
| 130 |
+
# Navigate to article
|
| 131 |
+
await page.goto(url, wait_until="domcontentloaded", timeout=20000)
|
| 132 |
+
|
| 133 |
+
# Wait longer for dynamic content to load
|
| 134 |
+
await page.wait_for_timeout(3000)
|
| 135 |
+
|
| 136 |
+
try:
|
| 137 |
+
# Wait for the main article content
|
| 138 |
+
# Try multiple selectors
|
| 139 |
+
await page.wait_for_selector("article, section, main, div[role='main'], h1", timeout=20000)
|
| 140 |
+
|
| 141 |
+
# Scroll to bottom to trigger lazy loading
|
| 142 |
+
for i in range(5):
|
| 143 |
+
await page.evaluate("window.scrollBy(0, window.innerHeight)")
|
| 144 |
+
await page.wait_for_timeout(500)
|
| 145 |
+
|
| 146 |
+
# Wait a bit more after scrolling
|
| 147 |
+
await page.wait_for_timeout(2000)
|
| 148 |
+
|
| 149 |
+
except Exception as e:
|
| 150 |
+
logger.warning(f"[Worker {self.worker_id}] Timeout waiting for article selectors: {e}")
|
| 151 |
+
|
| 152 |
+
# Fallback: Try Google Cache (Text Only)
|
| 153 |
+
try:
|
| 154 |
+
logger.info(f"[Worker {self.worker_id}] Trying Google Cache for: {url}")
|
| 155 |
+
cache_url = f"http://webcache.googleusercontent.com/search?q=cache:{url}&strip=1"
|
| 156 |
+
await page.goto(cache_url, wait_until="domcontentloaded", timeout=15000)
|
| 157 |
+
|
| 158 |
+
# Google Cache (Text Only) usually puts content in <pre> or just body
|
| 159 |
+
# We'll let the standard extractor try, or just grab body
|
| 160 |
+
await page.wait_for_selector("body", timeout=5000)
|
| 161 |
+
|
| 162 |
+
except Exception as e2:
|
| 163 |
+
logger.warning(f"[Worker {self.worker_id}] Google Cache failed: {e2}")
|
| 164 |
+
|
| 165 |
+
html = await page.content()
|
| 166 |
+
soup = BeautifulSoup(html, "html.parser")
|
| 167 |
+
|
| 168 |
+
content = extract_article_content(soup, url=url) # Pass URL for fallback parsing
|
| 169 |
+
content["html_debug"] = html # For debugging
|
| 170 |
+
|
| 171 |
+
# Fallback if markdown is empty
|
| 172 |
+
if not content.get("markdownContent"):
|
| 173 |
+
# Try to just get text from body as a last resort
|
| 174 |
+
body = soup.find("body")
|
| 175 |
+
if body:
|
| 176 |
+
text = body.get_text(separator="\n", strip=True)
|
| 177 |
+
# Clean up a bit
|
| 178 |
+
if len(text) > 500:
|
| 179 |
+
content["markdownContent"] = text[:5000] # Limit fallback text
|
| 180 |
+
else:
|
| 181 |
+
content["markdownContent"] = "Could not extract article content. It might be behind a paywall or login."
|
| 182 |
+
|
| 183 |
+
return content
|
| 184 |
+
|
| 185 |
+
finally:
|
| 186 |
+
await page.close()
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
class ScraperService:
|
| 190 |
+
"""
|
| 191 |
+
Manages a pool of ScraperWorkers for concurrent scraping.
|
| 192 |
+
"""
|
| 193 |
+
def __init__(self, max_workers: int = 5, headless: bool = True):
|
| 194 |
+
self.max_workers = max_workers
|
| 195 |
+
self.headless = headless
|
| 196 |
+
self.playwright = None
|
| 197 |
+
self.browser: Optional[Browser] = None
|
| 198 |
+
self.workers: List[ScraperWorker] = []
|
| 199 |
+
self.worker_queue = asyncio.Queue()
|
| 200 |
+
self._initialized = False
|
| 201 |
+
self._lock = asyncio.Lock()
|
| 202 |
+
|
| 203 |
+
async def ensure_initialized(self):
|
| 204 |
+
"""Starts Playwright, Browser, and Workers."""
|
| 205 |
+
async with self._lock:
|
| 206 |
+
# Check if browser is alive
|
| 207 |
+
if self.browser and not self.browser.is_connected():
|
| 208 |
+
logger.warning("Browser is disconnected. Restarting...")
|
| 209 |
+
await self.close()
|
| 210 |
+
self._initialized = False
|
| 211 |
+
self.workers = []
|
| 212 |
+
self.worker_queue = asyncio.Queue()
|
| 213 |
+
|
| 214 |
+
if self._initialized:
|
| 215 |
+
return
|
| 216 |
+
|
| 217 |
+
logger.info("Initializing Scraper Service...")
|
| 218 |
+
self.playwright = await async_playwright().start()
|
| 219 |
+
self.browser = await self.playwright.chromium.launch(headless=self.headless)
|
| 220 |
+
|
| 221 |
+
# Create Workers
|
| 222 |
+
self.workers = [] # Reset workers
|
| 223 |
+
for i in range(self.max_workers):
|
| 224 |
+
worker = ScraperWorker(i, self.browser)
|
| 225 |
+
self.workers.append(worker)
|
| 226 |
+
await self.worker_queue.put(worker)
|
| 227 |
+
|
| 228 |
+
self._initialized = True
|
| 229 |
+
logger.info(f"Initialized {self.max_workers} workers.")
|
| 230 |
+
|
| 231 |
+
async def _get_worker(self) -> ScraperWorker:
|
| 232 |
+
"""Retrieves a free worker from the queue."""
|
| 233 |
+
# Check connection before getting worker
|
| 234 |
+
if self.browser and not self.browser.is_connected():
|
| 235 |
+
logger.warning("Browser disconnected in _get_worker. Re-initializing...")
|
| 236 |
+
await self.ensure_initialized()
|
| 237 |
+
|
| 238 |
+
await self.ensure_initialized()
|
| 239 |
+
return await self.worker_queue.get()
|
| 240 |
+
|
| 241 |
+
async def _release_worker(self, worker: ScraperWorker):
|
| 242 |
+
"""Returns a worker to the queue."""
|
| 243 |
+
await self.worker_queue.put(worker)
|
| 244 |
+
|
| 245 |
+
async def scrape_search(self, query: str, max_articles: int = 5, progress_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> List[Dict[str, Any]]:
|
| 246 |
+
"""Delegates search to a free worker."""
|
| 247 |
+
worker = await self._get_worker()
|
| 248 |
+
try:
|
| 249 |
+
return await worker.scrape_search(query, max_articles, progress_callback)
|
| 250 |
+
finally:
|
| 251 |
+
await self._release_worker(worker)
|
| 252 |
+
|
| 253 |
+
async def scrape_tag(self, tag: str, max_articles: int = 5, progress_callback: Optional[Callable[[str], Awaitable[None]]] = None) -> List[Dict[str, Any]]:
|
| 254 |
+
"""Delegates tag scrape to a free worker."""
|
| 255 |
+
worker = await self._get_worker()
|
| 256 |
+
try:
|
| 257 |
+
return await worker.scrape_tag(tag, max_articles, progress_callback)
|
| 258 |
+
finally:
|
| 259 |
+
await self._release_worker(worker)
|
| 260 |
+
|
| 261 |
+
async def close(self):
|
| 262 |
+
"""Closes all workers and the browser."""
|
| 263 |
+
logger.info("Closing ScraperService...")
|
| 264 |
+
for worker in self.workers:
|
| 265 |
+
await worker.close()
|
| 266 |
+
|
| 267 |
+
if self.browser:
|
| 268 |
+
await self.browser.close()
|
| 269 |
+
if self.playwright:
|
| 270 |
+
await self.playwright.stop()
|
| 271 |
+
|
| 272 |
+
async def scrape_article(self, url: str) -> Dict[str, Any]:
|
| 273 |
+
"""Delegates article scrape to a free worker."""
|
| 274 |
+
worker = await self._get_worker()
|
| 275 |
+
try:
|
| 276 |
+
return await worker.scrape_article(url)
|
| 277 |
+
finally:
|
| 278 |
+
await self._release_worker(worker)
|
| 279 |
+
|
src/state.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from apify import Actor
|
| 2 |
+
from typing import Set
|
| 3 |
+
|
| 4 |
+
class StateManager:
|
| 5 |
+
"""
|
| 6 |
+
Manages the persistent state of the actor, specifically for deduplication.
|
| 7 |
+
"""
|
| 8 |
+
def __init__(self):
|
| 9 |
+
self.seen_urls: Set[str] = set()
|
| 10 |
+
|
| 11 |
+
async def load_state(self):
|
| 12 |
+
"""
|
| 13 |
+
Loads the state from the default key-value store.
|
| 14 |
+
"""
|
| 15 |
+
state = await Actor.get_value("STATE")
|
| 16 |
+
|
| 17 |
+
if not state:
|
| 18 |
+
# Fallback for local development
|
| 19 |
+
import json
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
cwd = Path.cwd()
|
| 22 |
+
local_state = cwd / "local_state.json"
|
| 23 |
+
Actor.log.info(f"Checking for local state at: {local_state}")
|
| 24 |
+
if local_state.exists():
|
| 25 |
+
try:
|
| 26 |
+
state = json.loads(local_state.read_text(encoding="utf-8"))
|
| 27 |
+
Actor.log.info(f"Loaded state from local file: {local_state}")
|
| 28 |
+
except Exception as e:
|
| 29 |
+
Actor.log.warning(f"Failed to load local state: {e}")
|
| 30 |
+
state = {}
|
| 31 |
+
else:
|
| 32 |
+
state = {}
|
| 33 |
+
|
| 34 |
+
self.seen_urls = set(state.get("seen_urls", []))
|
| 35 |
+
Actor.log.info(f"Loaded state: {len(self.seen_urls)} seen URLs.")
|
| 36 |
+
|
| 37 |
+
async def save_state(self):
|
| 38 |
+
"""
|
| 39 |
+
Saves the current state to the default key-value store.
|
| 40 |
+
"""
|
| 41 |
+
state = {
|
| 42 |
+
"seen_urls": list(self.seen_urls)
|
| 43 |
+
}
|
| 44 |
+
await Actor.set_value("STATE", state)
|
| 45 |
+
|
| 46 |
+
# Backup for local development
|
| 47 |
+
import json
|
| 48 |
+
from pathlib import Path
|
| 49 |
+
cwd = Path.cwd()
|
| 50 |
+
local_state = cwd / "local_state.json"
|
| 51 |
+
try:
|
| 52 |
+
local_state.write_text(json.dumps(state, indent=2), encoding="utf-8")
|
| 53 |
+
Actor.log.info(f"Backed up state to local file: {local_state}")
|
| 54 |
+
except Exception as e:
|
| 55 |
+
Actor.log.warning(f"Failed to backup local state: {e}")
|
| 56 |
+
|
| 57 |
+
Actor.log.info(f"Saved state: {len(self.seen_urls)} seen URLs.")
|
| 58 |
+
|
| 59 |
+
def is_seen(self, url: str) -> bool:
|
| 60 |
+
"""
|
| 61 |
+
Checks if a URL has already been seen.
|
| 62 |
+
"""
|
| 63 |
+
return url in self.seen_urls
|
| 64 |
+
|
| 65 |
+
def add_seen(self, url: str):
|
| 66 |
+
"""
|
| 67 |
+
Adds a URL to the set of seen URLs.
|
| 68 |
+
"""
|
| 69 |
+
self.seen_urls.add(url)
|
src/utils.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from apify import Actor
|
| 2 |
+
|
| 3 |
+
async def block_resources(route):
|
| 4 |
+
"""
|
| 5 |
+
Blocks unnecessary resources to speed up scraping.
|
| 6 |
+
"""
|
| 7 |
+
if route.request.resource_type in ["image", "stylesheet", "font", "media"]:
|
| 8 |
+
await route.abort()
|
| 9 |
+
else:
|
| 10 |
+
await route.continue_()
|