File size: 4,395 Bytes
8b425b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import asyncio
# import aiohttp
# import os
import json
import trafilatura
from openai import AsyncOpenAI
from pydantic import BaseModel, Field
from typing import List

# --- CONFIGURATION ---
MAX_SCRAPE_CONCURRENCY = 10  # max simultaneous article fetches (Trafilatura runs in executor threads)
MAX_AI_CONCURRENCY = 5  # max simultaneous OpenAI API requests (keeps rate limits in check)



# --- DATA MODELS ---
class CompanyResult(BaseModel):
    """One commercial company extracted from a single article.

    Used as part of the structured-output schema passed to the OpenAI API;
    the Field descriptions double as instructions to the model.
    """
    name: str = Field(..., description="Name of the commercial company")
    url: str = Field(..., description="""Official website URL. Predict if missing. 
                                        If you are NOT 100% sure about the official website, 
                                        respond ONLY with:'SEARCH_REQUIRED'""")
    article_id: int = Field(..., description="The ID of the article provided in context")


class ExtractionResponse(BaseModel):
    """Top-level structured-output payload: every company found in one article."""
    companies: List[CompanyResult]


# --- ROBUST WORKER ---
async def process_article(url: str, article_id: int, scrape_sem, ai_sem, OPENAI_API_KEY):
    """Scrape one article and extract commercial companies from it with the OpenAI API.

    Args:
        url: Article URL to fetch.
        article_id: Identifier echoed back into each extracted company record.
        scrape_sem: asyncio.Semaphore bounding concurrent scrapes.
        ai_sem: asyncio.Semaphore bounding concurrent OpenAI requests.
        OPENAI_API_KEY: API key used to build the per-call OpenAI client.

    Returns:
        On success: {"url", "status": "success", "companies": [dict, ...]}.
        On any failure: {"url", "error": <stage-specific message>} — this worker
        never raises, so a gather() over many articles cannot be torn down by
        one bad URL.
    """
    loop = asyncio.get_running_loop()

    # 1. Fetch & Extract (Using Trafilatura's robust fetcher)
    async with scrape_sem:
        try:
            # trafilatura.fetch_url is synchronous/blocking, so run it in a
            # worker thread to keep the event loop responsive.
            downloaded = await loop.run_in_executor(None, trafilatura.fetch_url, url)

            if downloaded is None:
                return {"url": url, "error": "Fetch failed (blocked or 404)"}

            # Text extraction is CPU bound; also off-loaded to the executor.
            text = await loop.run_in_executor(None, trafilatura.extract, downloaded)

            if not text:
                return {"url": url, "error": "No main text found"}

        except Exception as e:
            return {"url": url, "error": f"Scrape error: {str(e)}"}

    # 2. AI Extraction
    truncated_text = text[:5000]  # Trim to save tokens
    user_content = f"Article ID: {article_id}\n\nText:\n{truncated_text}"
    async with ai_sem:
        try:
            # Use the client as an async context manager so its underlying
            # HTTP connections are always released, even when the call fails
            # (the original leaked one unclosed client per article).
            async with AsyncOpenAI(api_key=OPENAI_API_KEY) as client:
                completion = await client.beta.chat.completions.parse(
                    model="gpt-4o-mini",
                    messages=[
                        {"role": "system", "content": "Extract commercial companies. Exclude generic entities, countries, government bodies."},
                        {"role": "user", "content": user_content},
                    ],
                    response_format=ExtractionResponse,
                    temperature=0
                )

            result_obj = completion.choices[0].message.parsed

            # `parsed` is None when the model refuses or the output cannot be
            # validated against the schema — report it instead of crashing on
            # the attribute access below.
            if result_obj is None:
                return {"url": url, "error": "AI error: no parsed output (refusal or schema mismatch)"}

            return {
                "url": url,
                "status": "success",
                "companies": [c.model_dump() for c in result_obj.companies]
            }

        except Exception as e:
            return {"url": url, "error": f"AI error: {str(e)}"}


# --- MAIN ORCHESTRATOR ---
async def run_pipeline(urls: List[str], OPENAI_API_KEY):
    scrape_sem = asyncio.Semaphore(MAX_SCRAPE_CONCURRENCY)
    ai_sem = asyncio.Semaphore(MAX_AI_CONCURRENCY)

    print(f"🚀 Processing {len(urls)} articles...")

    # We don't need aiohttp session anymore for fetching, as Trafilatura handles it.
    tasks = [
        process_article(url, idx, scrape_sem, ai_sem, OPENAI_API_KEY)
        for idx, url in enumerate(urls)
    ]
    results = await asyncio.gather(*tasks)

    # Reporting
    success = [r for r in results if "error" not in r]
    failures = [r for r in results if "error" in r]

    print(f"\n✅ Completed: {len(success)}")
    print(f"❌ Failed:    {len(failures)}")

    if success:
        print(f"\n[Sample Output]:\n{json.dumps(success[0], indent=2)}")

    # Save to file
    with open("final_results.json", "w") as f:
        json.dump(success, f, indent=2)

    return success


def get_companies_and_articles(article_url: list, OPENAI_API_KEY):
    """Synchronous entry point: drive the async extraction pipeline to completion.

    Args:
        article_url: List of article URLs to process.
        OPENAI_API_KEY: API key forwarded to the pipeline.

    Returns:
        The pipeline's list of successful per-article results.
    """
    return asyncio.run(run_pipeline(article_url, OPENAI_API_KEY))

# if __name__ == "__main__":
#     # REAL, LIVE URLs (Checked Feb 4, 2026)
#     live_urls = [
#         "https://newsroom.ibm.com/2026-02-04-ibm-opens-global-rfp-for-ai-driven-solutions-shaping-the-future-of-work-and-education",
#         "https://eng.lsm.lv/article/society/defence/04.02.2026-artificial-intelligence-centre-to-get-230000-euros-from-defence-budget.a633009/",
#         "https://www.unesco.org/en/articles/tech-spark-africa-advances-simulation-based-learning-skills-development"
#     ]
#
#     companies_with_articles = asyncio.run(run_pipeline(live_urls))