Spaces:
Sleeping
Sleeping
File size: 5,807 Bytes
ed42ca4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | import os, mimetypes, json, tempfile, time, pathlib
import boto3
from botocore.exceptions import BotoCoreError, ClientError
# ---------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------
# Region handed to every client created below (bucket/SES region).
AWS_REGION = os.getenv("AWS_DEFAULT_REGION", "us-east-2")
S3_BUCKET = os.getenv("AWS_S3_BUCKET")
SES_SENDER = os.getenv("SES_SENDER_EMAIL")

# Opt-in flag for public-read uploads (the bucket policy must allow public
# access as well). Only "1", "true" and "yes" (any case) switch it on.
AWS_S3_PUBLIC = os.getenv("AWS_S3_PUBLIC", "0").lower() in ("1", "true", "yes")

# Per-day free-tier safety rails; each can be overridden via the environment.
FREE_S3_MAX_UPLOADS = int(os.getenv("FREE_S3_MAX_UPLOADS", "10"))
FREE_S3_MAX_MB = int(os.getenv("FREE_S3_MAX_MB", "25"))
FREE_SES_MAX_EMAILS = int(os.getenv("FREE_SES_MAX_EMAILS", "10"))

# Clients exist only when an access key is present in the environment;
# otherwise all three are None and the helpers in this module fall back to
# their "no AWS" return values.
if os.getenv("AWS_ACCESS_KEY_ID"):
    s3_client = boto3.client("s3", region_name=AWS_REGION)
    ses_client = boto3.client("ses", region_name=AWS_REGION)
    comp_client = boto3.client("comprehend", region_name=AWS_REGION)
else:
    s3_client = ses_client = comp_client = None
# ---------------------------------------------------------------------
# Small daily counters (temp dir) to avoid exceeding free tier
# ---------------------------------------------------------------------
# One small JSON file per counter lives in this directory under the system
# temp dir; it is created at import time if it does not exist yet.
_COUNTER_DIR = pathlib.Path(tempfile.gettempdir()) / "newsintel_counters"
_COUNTER_DIR.mkdir(exist_ok=True)


def _rollover_counter(name: str) -> tuple[pathlib.Path, dict]:
    """Load today's counter state for *name* without modifying the file.

    The state is a dict ``{"day": "YYYY-MM-DD", "count": int}`` stored as
    JSON in ``<_COUNTER_DIR>/<name>.json``. A fresh zeroed state is returned
    when the file is missing, belongs to a previous day, or cannot be used
    (unreadable, invalid JSON, not a dict, or lacking an integer ``count``).

    Returns:
        A ``(path, blob)`` pair: the counter file path and the current state.
    """
    path = _COUNTER_DIR / f"{name}.json"
    today = time.strftime("%Y-%m-%d")
    try:
        blob = json.loads(path.read_text() or "{}")
    except (OSError, ValueError):
        # Missing/unreadable file or corrupt JSON (JSONDecodeError and
        # UnicodeDecodeError are ValueErrors). Without this, one bad file
        # would make every later call raise, since it is never rewritten.
        blob = None
    usable = (
        isinstance(blob, dict)
        and blob.get("day") == today
        and isinstance(blob.get("count"), int)
    )
    if not usable:
        blob = {"day": today, "count": 0}
    return path, blob


def _bump(name: str) -> int:
    """Increment today's counter for *name*, persist it, and return the new count."""
    path, blob = _rollover_counter(name)
    blob["count"] += 1
    path.write_text(json.dumps(blob))
    return blob["count"]
# ---------------------------------------------------------------------
# Comprehend helpers
# ---------------------------------------------------------------------
# Language codes that detect_language() is allowed to return.
SUPPORTED = {"en", "es", "de", "fr", "it", "pt"}
# Upper bound on the text sent to Comprehend, measured in UTF-8 bytes.
# NOTE(review): presumably chosen to stay under the service's per-request
# size limit for synchronous calls - confirm against the Comprehend quotas.
_COMP_MAX = 4500


def _safe_text(t: str) -> str:
    """Return *t* stripped and capped at ``_COMP_MAX`` UTF-8 bytes.

    ``None`` or empty input yields ``""``. Truncation never splits a
    multi-byte character: a partial trailing character is dropped. For pure
    ASCII text this is the same as keeping the first ``_COMP_MAX`` characters.
    """
    t = (t or "").strip()
    # Cheap pre-cut: every character is at least one byte, so anything past
    # _COMP_MAX characters can never fit in the byte budget.
    t = t[:_COMP_MAX]
    raw = t.encode("utf-8", errors="ignore")
    if len(raw) <= _COMP_MAX:
        return t
    # Size limits are in bytes, not characters, so non-ASCII text must be cut
    # on the encoded form; "ignore" discards a character cut in half.
    return raw[:_COMP_MAX].decode("utf-8", errors="ignore")
def detect_language(text: str) -> str:
    """Return the dominant language code of *text*, or "en" as a fallback.

    "en" is returned when no Comprehend client is configured, when the
    service reports no languages, when the top-ranked code is not in
    SUPPORTED, or when the call raises a BotoCoreError/ClientError.
    """
    if not comp_client:
        return "en"
    try:
        response = comp_client.detect_dominant_language(Text=_safe_text(text))
        candidates = response.get("Languages", [])
        if not candidates:
            return "en"
        # Only the first entry of the returned list is considered.
        code = candidates[0]["LanguageCode"]
        if code in SUPPORTED:
            return code
        return "en"
    except (BotoCoreError, ClientError):
        return "en"
def analyze_text(text: str, lang: str = "en"):
    """Run sentiment, entity and key-phrase detection on *text*.

    Returns a dict with keys "sentiment", "entities" and "key_phrases".
    When no Comprehend client is configured, or any of the three calls
    raises a BotoCoreError/ClientError, a neutral result with empty lists
    is returned instead (no partial results).
    """
    neutral = {"sentiment": "NEUTRAL", "entities": [], "key_phrases": []}
    if not comp_client:
        return neutral

    # The same size-capped text and language code go to all three calls.
    request = {"Text": _safe_text(text), "LanguageCode": lang}
    try:
        sentiment = comp_client.detect_sentiment(**request).get("Sentiment", "NEUTRAL")
        entities = comp_client.detect_entities(**request).get("Entities", [])
        phrases = comp_client.detect_key_phrases(**request).get("KeyPhrases", [])
    except (BotoCoreError, ClientError):
        return neutral
    return {"sentiment": sentiment, "entities": entities, "key_phrases": phrases}
# ---------------------------------------------------------------------
# S3 upload (with safety guards)
# ---------------------------------------------------------------------
def s3_upload(file_path: str, key_prefix: str = "newsintel/") -> str | None:
    """
    Upload a file to S3.

    - If AWS_S3_PUBLIC=1 (and bucket policy allows), returns an https URL
      whose path is percent-encoded so keys with spaces or reserved
      characters still yield a usable link.
    - Otherwise returns an s3:// URI (private).
    - Free-tier guard: caps per-day uploads and file size.

    The object key is ``key_prefix`` followed by the file's base name.
    Returns None when S3 is not configured, the path is not an existing
    file, a guard trips, or the upload fails.
    """
    # Local imports keep this block self-contained; boto3 is already loaded
    # by the module, so this only binds the exception class.
    from urllib.parse import quote
    from boto3.exceptions import S3UploadFailedError

    if not (s3_client and S3_BUCKET and file_path and os.path.isfile(file_path)):
        return None

    # size guard
    size_mb = os.path.getsize(file_path) / (1024 * 1024)
    if size_mb > FREE_S3_MAX_MB:
        return None

    # daily count guard
    _, counter = _rollover_counter("s3")
    if counter["count"] >= FREE_S3_MAX_UPLOADS:
        return None

    key = f"{key_prefix}{os.path.basename(file_path)}"
    extra = {"ContentType": mimetypes.guess_type(file_path)[0] or "application/octet-stream"}
    if AWS_S3_PUBLIC:
        # NOTE(review): buckets with ACLs disabled reject this argument;
        # confirm the target bucket's object-ownership setting.
        extra["ACL"] = "public-read"

    try:
        s3_client.upload_file(file_path, S3_BUCKET, key, ExtraArgs=extra)
    except (BotoCoreError, ClientError, S3UploadFailedError):
        # upload_file reports service-side failures as S3UploadFailedError,
        # which is not a BotoCoreError/ClientError subclass.
        return None

    # Count only uploads that actually succeeded.
    _bump("s3")
    if AWS_S3_PUBLIC:
        # quote() leaves "/" intact and percent-encodes everything unsafe.
        return f"https://{S3_BUCKET}.s3.{AWS_REGION}.amazonaws.com/{quote(key)}"
    return f"s3://{S3_BUCKET}/{key}"
# ---------------------------------------------------------------------
# SES email (with safety guard)
# ---------------------------------------------------------------------
def ses_send_email(recipient: str, subject: str, html_body: str) -> bool:
    """
    Send an HTML email via SES. Requires SES_SENDER to be verified (and recipient if in sandbox).
    Free-tier guard: caps per-day emails.

    Subject and body are sent with an explicit UTF-8 charset so non-ASCII
    text is transmitted correctly.

    Returns True when SES accepted the message; False when SES is not
    configured, *recipient* is empty, the daily cap is reached, or the call
    raises a BotoCoreError/ClientError.
    """
    if not (ses_client and SES_SENDER and recipient):
        return False

    _, counter = _rollover_counter("ses")
    if counter["count"] >= FREE_SES_MAX_EMAILS:
        return False

    try:
        ses_client.send_email(
            Source=SES_SENDER,
            Destination={"ToAddresses": [recipient]},
            Message={
                # Without Charset, SES assumes 7-bit ASCII content.
                "Subject": {"Data": subject, "Charset": "UTF-8"},
                "Body": {"Html": {"Data": html_body, "Charset": "UTF-8"}},
            },
        )
    except (BotoCoreError, ClientError):
        return False

    # Count only messages that SES accepted.
    _bump("ses")
    return True
|