Spaces:
Running
Running
Commit
·
422c1f3
0
Parent(s):
Fresh repo
Browse files- Dockerfile +36 -0
- README.md +11 -0
- app.py +34 -0
- code/app.py +216 -0
- code/config.py +34 -0
- code/model.py +158 -0
- code/web_scraper_allegro.py +142 -0
- code/web_scraper_ebay.py +93 -0
- code/web_scraper_olx.py +55 -0
- requirements.txt +17 -0
Dockerfile
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10

WORKDIR /app

# System tools needed to fetch and install Chrome
RUN apt-get update && apt-get install -y \
    wget \
    curl \
    unzip \
    && rm -rf /var/lib/apt/lists/*

# Install Google Chrome via the keyring-based repository setup.
# BUGFIX: the previous RUN tried deprecated `apt-key add` first; when that
# branch succeeded it never wrote the sources.list entry, so the
# google-chrome-stable install failed. Use the modern signed-by approach only.
RUN mkdir -p /etc/apt/keyrings && \
    curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-chrome.gpg && \
    echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
    apt-get update && apt-get install -y google-chrome-stable && \
    rm -rf /var/lib/apt/lists/*

# Copy requirements first so dependency installation is cached across builds
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire project
COPY . .

# Hugging Face Spaces routes traffic to port 7860
EXPOSE 7860

# Unbuffered stdout so logs appear immediately in the Space console
ENV PYTHONUNBUFFERED=1

# Run the app
CMD ["python", "app.py"]
|
README.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Antique Auth API
|
| 3 |
+
emoji: 🏆
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
short_description: API used for my team project classes
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py - Main entry point for Hugging Face Spaces
import os
import sys
import importlib.util


def _load_inner_app():
    """Load the FastAPI app object defined in code/app.py.

    The package directory is named `code`, which collides with the stdlib
    `code` module, so the file is loaded via importlib under a private
    module name instead of a normal `import code.app`.
    """
    here = os.path.dirname(__file__)
    code_dir = os.path.join(here, "code")
    inner_path = os.path.join(code_dir, "app.py")

    # code/app.py uses flat imports (`from model import ...`), so its
    # directory must be on sys.path while it executes.
    added = code_dir not in sys.path
    if added:
        sys.path.insert(0, code_dir)

    spec = importlib.util.spec_from_file_location("antique_auth_code_app", inner_path)
    loaded = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(loaded)

    # Drop the temporary sys.path entry again to limit side effects.
    try:
        if sys.path[0] == code_dir:
            sys.path.pop(0)
    except Exception:
        pass

    # The FastAPI `app` object expected inside code/app.py
    return getattr(loaded, "app")


app = _load_inner_app()

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
code/app.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, UploadFile, Form, File
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import torch
from PIL import Image
import io
from model import AuctionAuthenticityModel
from config import (
    AUTHENTICITY_CLASSES,
    CATEGORIES,
    UNCERTAINTY_CONFIDENCE_THRESHOLD,
    UNCERTAINTY_MARGIN_THRESHOLD,
    UNCERTAIN_CATEGORY,
)
from torchvision import transforms
import os
import numpy as np
from huggingface_hub import hf_hub_download

# FastAPI application serving the authenticity classifier.
app = FastAPI(
    title="Antique Auction Authenticity API",
    description="AI model for antique auction authenticity evaluation",
    version="1.0.0",
)

# Fully open CORS policy: the API is meant to be callable from any frontend.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Inference runs on CPU.
DEVICE = torch.device("cpu")

# Hub repository holding the trained weights; overridable via env var.
MODEL_REPO_ID = os.getenv("MODEL_REPO_ID", "hatamo/auction-authenticity-model")
MODEL_FILENAME = "auction_model.pt"  # whatever you pushed

# Populated once at startup by load_model().
authenticity_model = None

# Preprocessing pipeline: 224x224 resize + ImageNet mean/std normalization,
# matching the EfficientNet backbone's expected input.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@app.on_event("startup")
async def load_model():
    """Download the trained weights from the HF Hub and initialise the model.

    Runs once when the FastAPI process starts; stores the ready-to-use
    model in the module-level `authenticity_model` global.
    """
    global authenticity_model
    print("🚀 Loading model...")

    # hf_hub_download caches under ~/.cache/huggingface/hub and returns
    # the local path of the downloaded file.
    weights_path = hf_hub_download(
        repo_id=MODEL_REPO_ID,
        filename=MODEL_FILENAME,
    )

    authenticity_model = AuctionAuthenticityModel(device=DEVICE).to(DEVICE)
    checkpoint = torch.load(weights_path, map_location=DEVICE)
    authenticity_model.load_state_dict(checkpoint)
    # Inference-only: disable dropout etc.
    authenticity_model.eval()
    print("✓ Model ready")
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def predict_single(img_tensor, text):
    """Run one image/text pair through the model.

    Args:
        img_tensor: preprocessed image batch of size 1 on DEVICE.
        text: listing title+description string (wrapped in a 1-element list
            because the model expects a batch of texts).

    Returns:
        (auth_probs, cat_probs) as 1-D numpy arrays for the single sample.
    """
    with torch.no_grad():
        result = authenticity_model(img_tensor, [text])
        auth = result["auth_probs"][0].cpu().numpy()
        cat = result["cat_probs"][0].cpu().numpy()
    return auth, cat
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def build_verdict(probs, labels):
    """Turn a probability vector into a verdict plus an uncertainty flag.

    Args:
        probs: per-class probabilities, indexable by 0..len(labels)-1.
        labels: mapping from class index to class name (see config.py).

    Returns:
        (probs_dict, best_label, best_prob, margin, uncertain) where
        `margin` is the top-1 minus top-2 probability and `uncertain` is
        True when either the confidence or the margin falls below the
        thresholds configured in config.py.
    """
    probs_dict = {labels[i]: float(probs[i]) for i in range(len(labels))}
    best_label = max(probs_dict, key=probs_dict.get)
    best_prob = probs_dict[best_label]

    sorted_probs = sorted(probs_dict.values(), reverse=True)
    # BUGFIX: with a single-class label mapping there is no runner-up and
    # sorted_probs[1] raised IndexError; treat the margin as the full
    # confidence in that degenerate case.
    if len(sorted_probs) > 1:
        margin = sorted_probs[0] - sorted_probs[1]
    else:
        margin = sorted_probs[0]

    uncertain = (
        best_prob < UNCERTAINTY_CONFIDENCE_THRESHOLD
        or margin < UNCERTAINTY_MARGIN_THRESHOLD
    )

    return probs_dict, best_label, best_prob, margin, uncertain
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
@app.post("/validate_url")
async def validate_url(url: str = Form(...), max_images: int = Form(3)):
    """Scrape an auction URL and evaluate its authenticity and category.

    Args:
        url: Allegro, OLX, or eBay offer URL (form field).
        max_images: number of listing photos to run the model on,
            clamped to the range 1..10.

    Returns:
        200 JSON with an `evaluation` summary plus full `details`
        (per-class probabilities, margins, verdicts);
        400 for unsupported platforms or listings without images;
        500 with the traceback on any scraping/inference failure.
    """
    try:
        from io import BytesIO
        import requests

        # Clamp so a caller cannot request unbounded work.
        max_images = max(1, min(max_images, 10))

        # Dispatch to the platform scraper; imported lazily so heavy
        # dependencies (selenium, apify) load only when actually needed.
        if "allegro.pl" in url:
            from web_scraper_allegro import scrape_allegro_offer

            auction = scrape_allegro_offer(url)
        elif "olx.pl" in url:
            from web_scraper_olx import scrape_olx_offer

            auction = scrape_olx_offer(url)
        elif "ebay." in url:
            from web_scraper_ebay import scrape_ebay_offer

            auction = scrape_ebay_offer(url)
        else:
            return JSONResponse({"error": "Unsupported platform"}, status_code=400)

        if not auction.get("image_urls"):
            return JSONResponse({"error": "No images"}, status_code=400)

        images_to_use = min(max_images, len(auction["image_urls"]))

        auth_probs_list = []
        cat_probs_list = []

        # Single text input shared by all images of the listing.
        text = auction["title"] + " " + auction.get("description", "")

        for img_url in auction["image_urls"][:images_to_use]:
            img_resp = requests.get(img_url, timeout=15)
            img_resp.raise_for_status()

            img = Image.open(BytesIO(img_resp.content)).convert("RGB")
            img_tensor = transform(img).unsqueeze(0).to(DEVICE)

            auth_probs, cat_probs = predict_single(img_tensor, text)

            auth_probs_list.append(auth_probs)
            cat_probs_list.append(cat_probs)

        # Average probabilities over all evaluated photos before deciding.
        avg_auth_probs = np.mean(auth_probs_list, axis=0)
        avg_cat_probs = np.mean(cat_probs_list, axis=0)

        auth_dict, best_auth, best_auth_prob, auth_margin, auth_uncertain = build_verdict(
            avg_auth_probs, AUTHENTICITY_CLASSES
        )

        cat_dict, best_cat, best_cat_prob, cat_margin, cat_uncertain = build_verdict(
            avg_cat_probs, CATEGORIES
        )

        auth_verdict = "UNCERTAIN" if auth_uncertain else best_auth
        category_verdict = UNCERTAIN_CATEGORY if cat_uncertain else best_cat

        return JSONResponse(
            {
                "status": "success",
                "evaluation": {
                    "title": auction["title"],
                    "image_urls": auction["image_urls"][:images_to_use],
                    # BUGFIX: the eBay and OLX scrapers do not return a
                    # "price" key, so auction["price"] raised KeyError and
                    # turned every non-Allegro request into a 500.
                    "price": auction.get("price"),
                    "category": None
                    if category_verdict == UNCERTAIN_CATEGORY
                    else category_verdict,
                    "evaluation_status": auth_verdict,
                    "confidence": round(best_auth_prob, 3),
                },
                "details": {
                    "url": url,
                    "platform": auction["platform"],
                    "image_count_used": images_to_use,
                    "authenticity": {
                        "verdict": auth_verdict,
                        "confidence": round(best_auth_prob, 3),
                        "margin": round(auth_margin, 3),
                        "probabilities": {
                            k: round(v, 3) for k, v in auth_dict.items()
                        },
                    },
                    "category": {
                        "verdict": category_verdict,
                        "label": best_cat,
                        "confidence": round(best_cat_prob, 3),
                        "margin": round(cat_margin, 3),
                        "probabilities": {
                            k: round(v, 3) for k, v in cat_dict.items()
                        },
                    },
                },
            }
        )

    except Exception as e:
        import traceback

        # Surface the traceback in the response for easier Space debugging.
        return JSONResponse(
            {"status": "error", "error": str(e), "traceback": traceback.format_exc()},
            status_code=500,
        )
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
@app.get("/health")
def health():
    """Liveness probe: confirms the API process is up and serving."""
    payload = {"status": "ok", "message": "API running"}
    return payload
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
@app.get("/")
def root():
    """Service metadata plus a map of the routes this API actually exposes."""
    return {
        "name": "Antique Auction Authenticity API",
        "version": "1.0.0",
        # BUGFIX: previously advertised "POST /predict", which does not
        # exist — the evaluation endpoint is POST /validate_url.
        "endpoints": {
            "POST /validate_url": "Evaluate auction",
            "GET /health": "Health check",
        },
    }
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
if __name__ == "__main__":
    # Local/dev entry point; on Spaces the top-level app.py starts the server.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
|
code/config.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# config.py
"""
Central configuration for label spaces and uncertainty handling.

Keeping these mappings in one place allows the label sets and thresholds
to be adjusted without touching model or API code.
"""

# Authenticity verdict labels (main model head).
# NOTE: changing this set requires retraining the model.
AUTHENTICITY_CLASSES = {
    0: "ORIGINAL",
    1: "SCAM",
    2: "REPLICA",
}

# Reverse lookup: label name -> class index.
AUTHENTICITY_TO_ID = {name: idx for idx, name in AUTHENTICITY_CLASSES.items()}

# Item categories (separate classifier head).
# Five concrete categories; "Uncertain" is applied automatically by the API
# when confidence is low and is NOT part of the model output. Edit these
# five entries as needed.
CATEGORIES = {
    0: "Clocks",
    1: "Furniture",
    2: "Numismatics",
    3: "Sabers",
    4: "Tableware",
}

# Reverse lookup: category name -> class index.
CATEGORY_TO_ID = {name: idx for idx, name in CATEGORIES.items()}

# Label used in API responses when the category prediction is not trusted.
UNCERTAIN_CATEGORY = "Uncertain"

# A prediction is flagged uncertain when the top probability falls below
# the confidence threshold OR the top-2 margin falls below the margin
# threshold (see build_verdict in code/app.py).
UNCERTAINTY_CONFIDENCE_THRESHOLD = 0.6
UNCERTAINTY_MARGIN_THRESHOLD = 0.15
|
code/model.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# model.py
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from transformers import DistilBertTokenizer, DistilBertModel
|
| 5 |
+
from torchvision.models import efficientnet_b0
|
| 6 |
+
from config import AUTHENTICITY_CLASSES, CATEGORIES
|
| 7 |
+
|
| 8 |
+
class AuctionAuthenticityModel(nn.Module):
    """Multimodal (image + text) classifier with two output heads.

    An EfficientNet-B0 backbone embeds the photo (1280-d) and a multilingual
    DistilBERT embeds the listing text (768-d CLS token). The concatenated
    features pass through a shared MLP trunk feeding two linear heads:
    authenticity (see AUTHENTICITY_CLASSES) and item category (CATEGORIES).
    """

    def __init__(self, num_classes=None, device='cpu'):
        """
        Args:
            num_classes: number of authenticity classes; defaults to
                len(AUTHENTICITY_CLASSES) from config.py.
            device: torch device used when moving tokenized text in forward().
        """
        # If num_classes not specified, use config
        if num_classes is None:
            num_classes = len(AUTHENTICITY_CLASSES)
        # Category classes (separate head)
        num_categories = len(CATEGORIES)
        super().__init__()
        self.device = device

        # Vision backbone.
        # torchvision >= 0.13 deprecates `pretrained=` in favour of the
        # `weights=` API; use the new API and fall back for older versions.
        try:
            from torchvision.models import EfficientNet_B0_Weights
            self.vision_model = efficientnet_b0(
                weights=EfficientNet_B0_Weights.IMAGENET1K_V1
            )
        except ImportError:
            self.vision_model = efficientnet_b0(pretrained=True)
        # Strip the classification layer to expose raw 1280-d features.
        self.vision_model.classifier = nn.Identity()
        vision_out_dim = 1280

        # Text encoder and matching tokenizer.
        self.text_model = DistilBertModel.from_pretrained(
            'distilbert-base-multilingual-cased'
        )
        text_out_dim = 768

        self.tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-multilingual-cased'
        )

        # Fusion encoder (shared) -> then two heads (authenticity + category)
        hidden_dim = 256
        self.fusion_encoder = nn.Sequential(
            nn.Linear(vision_out_dim + text_out_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
        )

        # Heads
        self.auth_head = nn.Linear(128, num_classes)
        self.cat_head = nn.Linear(128, num_categories)

        # store sizes for reference
        self.num_classes = num_classes
        self.num_categories = num_categories

    def forward(self, images, texts):
        """Run a batch through both heads.

        Args:
            images: float tensor (B, 3, H, W) — callers feed 224x224
                ImageNet-normalized images.
            texts: list of B strings (tokenized here, truncated to 512).

        Returns:
            dict with 'auth_logits', 'auth_probs', 'cat_logits', 'cat_probs'.
        """
        vision_features = self.vision_model(images)
        tokens = self.tokenizer(
            texts, padding=True, truncation=True, max_length=512, return_tensors='pt'
        ).to(self.device)
        text_outputs = self.text_model(**tokens)
        # CLS-token embedding as the sentence representation.
        text_features = text_outputs.last_hidden_state[:, 0, :]

        combined = torch.cat([vision_features, text_features], dim=1)
        shared = self.fusion_encoder(combined)

        auth_logits = self.auth_head(shared)
        cat_logits = self.cat_head(shared)

        # probabilities
        auth_probs = torch.softmax(auth_logits, dim=1)
        cat_probs = torch.softmax(cat_logits, dim=1)

        return {
            'auth_logits': auth_logits,
            'auth_probs': auth_probs,
            'cat_logits': cat_logits,
            'cat_probs': cat_probs,
        }

    def compute_loss(self, outputs, auth_labels=None, cat_labels=None, auth_weight=1.0, cat_weight=1.0):
        """Compute combined loss for two heads. Labels should be LongTensors on same device.

        Category labels equal to -1 mark unknown/uncertain items and are
        excluded from the category loss.

        Returns combined scalar loss and a dict with individual losses.
        """
        losses = {}
        loss = 0.0
        criterion = nn.CrossEntropyLoss()

        if auth_labels is not None:
            l_auth = criterion(outputs['auth_logits'], auth_labels)
            losses['auth_loss'] = l_auth
            loss = loss + auth_weight * l_auth

        if cat_labels is not None:
            # Allow sentinel -1 for unknown/uncertain categories and ignore them
            if cat_labels.dim() == 1:
                mask = cat_labels >= 0
            else:
                mask = (cat_labels.squeeze(-1) >= 0)

            if mask.sum().item() > 0:
                selected_logits = outputs['cat_logits'][mask]
                selected_labels = cat_labels[mask]
                # BUGFIX: when cat_labels arrives as (B, 1), boolean masking
                # keeps the trailing dim and CrossEntropyLoss requires 1-D
                # class targets — flatten before computing the loss.
                if selected_labels.dim() > 1:
                    selected_labels = selected_labels.reshape(-1)
                l_cat = criterion(selected_logits, selected_labels)
                losses['cat_loss'] = l_cat
                loss = loss + cat_weight * l_cat
            else:
                # No valid category labels in batch — contribute zero loss.
                losses['cat_loss'] = torch.tensor(0.0, device=self.device)

        return loss, losses

    def count_parameters(self):
        """Return the number of trainable parameters."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
if __name__ == '__main__':
    # Smoke test: build the model and push a dummy batch through it.
    print("Testowanie modelu...")

    device = torch.device('cpu')
    model = AuctionAuthenticityModel(device=device).to(device)

    print(f"✓ Model stworzony")
    print(f" - Parametrów: {model.count_parameters():,}")

    # Dummy forward pass on a 2-sample batch.
    dummy_img = torch.randn(2, 3, 224, 224).to(device)
    dummy_texts = ["Silver spoon antique", "Polish silverware 19th century"]

    with torch.no_grad():
        output = model(dummy_img, dummy_texts)

    # Report output tensor shapes.
    print("✓ Forward pass:")
    for key in ('auth_logits', 'auth_probs', 'cat_logits', 'cat_probs'):
        print(f" - {key}: {output[key].shape}")

    # Per-sample argmax predictions with their probabilities.
    auth_pred = torch.argmax(output['auth_probs'], dim=1)
    cat_pred = torch.argmax(output['cat_probs'], dim=1)

    for sample in range(output['auth_probs'].shape[0]):
        a_idx = int(auth_pred[sample].item())
        c_idx = int(cat_pred[sample].item())
        a_prob = float(output['auth_probs'][sample, a_idx].item())
        c_prob = float(output['cat_probs'][sample, c_idx].item())
        a_name = AUTHENTICITY_CLASSES.get(a_idx, str(a_idx))
        c_name = CATEGORIES.get(c_idx, str(c_idx))
        print(f"\nSample {sample}:")
        print(f" - Authenticity: {a_name} ({a_prob:.3f})")
        print(f" - Category: {c_name} ({c_prob:.3f})")

    # Rough on-disk size estimate via a temporary checkpoint.
    print(f"\n📊 Rozmiar modelu:")
    torch.save(model.state_dict(), 'temp_model.pt')
    import os
    size_mb = os.path.getsize('temp_model.pt') / (1024 * 1024)
    print(f" - {size_mb:.1f} MB")
    os.remove('temp_model.pt')
|
code/web_scraper_allegro.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from apify_client import ApifyClient
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def sanitize_folder_name(text):
    """Lowercase *text* and reduce it to filesystem-safe characters.

    Polish diacritics are transliterated to ASCII, other non-alphanumeric
    characters become underscores, runs of underscores collapse to one,
    and leading/trailing underscores are stripped.
    """
    translit = {
        "ą": "a", "ć": "c", "ę": "e", "ł": "l", "ń": "n",
        "ó": "o", "ś": "s", "ź": "z", "ż": "z"
    }
    pieces = []
    for ch in text.lower():
        if ch in translit:
            pieces.append(translit[ch])
        elif ch.isalnum():
            pieces.append(ch)
        else:
            pieces.append("_")
    sanitized = "".join(pieces)
    # Collapse consecutive separators produced by adjacent punctuation.
    while "__" in sanitized:
        sanitized = sanitized.replace("__", "_")
    return sanitized.strip("_")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def extract_price(price_str):
    """Pull a numeric amount out of a free-form price value.

    Returns None for empty/falsy input, a normalized "NN.NN"/"NN" string
    when a number is found (comma decimal separator converted to a dot),
    or the original value unchanged when no digits are present.
    """
    if not price_str:
        return None
    found = re.search(r'(\d+[.,]\d{2}|\d+)', str(price_str))
    if found is None:
        return price_str
    return found.group(1).replace(',', '.')
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def extract_images_from_apify(item_data):
    """Collect image URLs from an Apify item and upgrade them to originals.

    Looks at the 'images' (list or single value), 'image', and 'imageUrl'
    fields; keeps only string URLs that look like image CDN links and
    rewrites known Allegro size segments to the '/original/' variant.
    Returns a de-duplicated list (order unspecified).
    """
    normalized = set()
    size_tokens = ["/s128/", "/s360/", "/s512/", "/s720/", "/s1024/", "/s1440/", "/original/"]

    candidates = []
    if 'images' in item_data and item_data['images']:
        images_field = item_data['images']
        if isinstance(images_field, list):
            candidates.extend(images_field)
        else:
            candidates.append(images_field)

    if 'image' in item_data and item_data['image']:
        candidates.append(item_data['image'])

    if 'imageUrl' in item_data and item_data['imageUrl']:
        candidates.append(item_data['imageUrl'])

    for candidate in candidates:
        if candidate and isinstance(candidate, str):
            # Loose filter: Allegro CDN hosts or anything with "img" in it.
            if "allegroimg.com" in candidate or "img" in candidate:
                for token in size_tokens:
                    candidate = candidate.replace(token, "/original/")
                normalized.add(candidate)

    return list(normalized)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def scrape_allegro_offer(url: str):
    """Scrape single Allegro product using Apify E-commerce Tool.

    Requires the APIFY_API_TOKEN environment variable. Always returns a
    dict with platform/url/title/description/price/image_urls; scraping
    failures are reported via a result with title "error" rather than
    raising.
    """
    token = os.getenv('APIFY_API_TOKEN')
    if not token:
        raise ValueError("APIFY_API_TOKEN environment variable not set")

    apify = ApifyClient(token)

    # Correct input format for E-commerce Scraping Tool
    actor_input = {"startUrls": [url]}

    print(f"🔍 Scraping: {url}")

    try:
        run = apify.actor("e-commerce/allegro-product-detail-scraper").call(
            run_input=actor_input
        )
        rows = list(apify.dataset(run['defaultDatasetId']).iterate_items())

        if not rows:
            print("⚠️ No data returned from Apify")
            return {
                "platform": "allegro",
                "url": url,
                "title": "untitled",
                "description": "No description",
                "price": None,
                "image_urls": []
            }

        item = rows[0]
        print(f"✅ Success! Found: {item.get('productTitle', 'untitled')}")

        image_urls = extract_images_from_apify(item)

        # Fall back to the thumbnail when no gallery images were found.
        if not image_urls:
            thumbnail = item.get("thumbnail")
            if thumbnail:
                image_urls = [thumbnail]

        return {
            "platform": "allegro",
            "url": item.get('url', url),
            "title": item.get('productTitle', 'untitled').strip(),
            "description": item.get('description', 'No description'),
            "price": extract_price(item.get('price', item.get('currentPrice'))),
            "image_urls": image_urls
        }

    except Exception as e:
        # Best-effort contract: report the failure inside the result dict.
        print(f"❌ Error: {e}")
        return {
            "platform": "allegro",
            "url": url,
            "title": "error",
            "description": str(e),
            "price": None,
            "image_urls": []
        }
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
# Example usage
|
| 132 |
+
if __name__ == "__main__":
    # Manual smoke test: scrape one offer and print a short summary.
    offer_url = input("Allegro URL: ")
    offer = scrape_allegro_offer(offer_url)

    print("\n✅ Scraping result:")
    print(f"Title: {offer['title']}")
    print(f"Price: {offer['price']}")
    desc = offer['description']
    if len(desc) > 100:
        print(f"Description: {desc[:100]}...")
    else:
        print(f"Description: {desc}")
    print(f"Images: {len(offer['image_urls'])} found")
    for img in offer['image_urls'][:3]:
        print(f" - {img}")
|
code/web_scraper_ebay.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# scrape_ebay_offer.py
|
| 2 |
+
import undetected_chromedriver as uc
|
| 3 |
+
from selenium.webdriver.common.by import By
|
| 4 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 5 |
+
from selenium.webdriver.chrome.service import Service
|
| 6 |
+
import time
|
| 7 |
+
import requests
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
def scrape_ebay_offer(url: str):
    """Scrape an eBay listing with undetected-chromedriver.

    Returns a dict with platform/url/title/description/parameters/image_urls;
    nothing is written to disk. The browser is always closed, even on error.
    """
    import re  # local import: used only for the HD image URL rewrite below

    print(f"🔍 eBay: {url}")
    options = uc.ChromeOptions()
    options.add_argument("--window-position=-3000,0")
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    # Use the system Chrome when present (the Docker image installs it there).
    if os.path.exists('/usr/bin/google-chrome'):
        options.binary_location = '/usr/bin/google-chrome'

    driver = uc.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
        use_subprocess=True
    )

    try:
        driver.get(url)
        time.sleep(4)  # crude wait for the dynamic page to settle

        # TITLE
        # BUGFIX (all handlers below): bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit; narrowed to Exception.
        try:
            title_element = driver.find_element(By.CSS_SELECTOR, "h1.x-item-title__mainTitle")
            title_str = title_element.text.strip()
        except Exception:
            title_str = "untitled_ebay"

        # PARAMETERS (label/value rows from the item specifics section)
        parameter_list = []
        try:
            rows = driver.find_elements(By.CSS_SELECTOR, ".ux-labels-values")
            for row in rows:
                try:
                    label = row.find_element(By.CSS_SELECTOR, ".ux-labels-values__labels").text.strip()
                    value = row.find_element(By.CSS_SELECTOR, ".ux-labels-values__values").text.strip()
                    if label and value:
                        parameter_list.append(f"{label}: {value}")
                except Exception:
                    continue
        except Exception:
            pass

        # DESCRIPTION (rendered inside an iframe)
        description_content = "No description"
        try:
            frame = driver.find_element(By.ID, "desc_ifr")
            driver.switch_to.frame(frame)
            description_content = driver.find_element(By.TAG_NAME, "body").text.strip()
            driver.switch_to.default_content()
        except Exception:
            pass

        # IMAGES
        unique_links = set()
        try:
            thumbnails = driver.find_elements(By.CSS_SELECTOR, ".ux-image-grid-item img")
            for img in thumbnails:
                src = img.get_attribute("src") or img.get_attribute("data-src")
                if src and "ebayimg.com" in src:
                    # BUGFIX: the old .replace("/s-l64/", "/s-l1600") dropped
                    # the path separator and never matched the common
                    # ".../s-l140.jpg" form at all. Rewrite any /s-l<N> size
                    # token (followed by '.' or '/') to the HD /s-l1600 variant.
                    hd_link = re.sub(r"/s-l\d+(?=[./])", "/s-l1600", src)
                    unique_links.add(hd_link)
        except Exception:
            pass

        return {
            "platform": "ebay",
            "url": url,
            "title": title_str,
            "description": description_content,
            "parameters": parameter_list,
            "image_urls": list(unique_links)
        }

    finally:
        # Always release the browser, even when scraping fails.
        driver.quit()
|
| 89 |
+
|
| 90 |
+
if __name__ == "__main__":
    # Manual smoke test against a live listing.
    offer_url = input("eBay URL: ")
    print(scrape_ebay_offer(offer_url))
|
code/web_scraper_olx.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# scrape_olx_offer.py
|
| 2 |
+
import requests
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
|
| 5 |
+
def scrape_olx_offer(url: str):
    """Scrape an OLX listing; returns the data without writing to disk.

    Returns a dict with platform/url/title/description/parameters/image_urls.
    Raises ValueError on a non-200 HTTP response.

    NOTE(review): the CSS class names below (css-1au435n, css-19duwlz,
    css-13x8d99) are build-generated and will break when OLX redeploys —
    verify periodically.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }

    print(f"🔍 OLX: {url}")
    # BUGFIX: added a timeout so a stalled connection cannot hang the
    # request (and the API worker calling it) indefinitely.
    response = requests.get(url, headers=headers, timeout=20)

    if response.status_code != 200:
        raise ValueError(f"OLX error: {response.status_code}")

    soup = BeautifulSoup(response.content, "html.parser")

    # TITLE
    title_element = soup.find("h4", class_="css-1au435n")
    title = title_element.get_text().strip() if title_element else "untitled"

    # DESCRIPTION
    description_element = soup.find("div", class_="css-19duwlz")
    description = description_element.get_text(separator="\n").strip() if description_element else "No description"

    # PARAMETERS
    parameter_list = []
    parameters_container = soup.find("div", attrs={"data-testid": "ad-parameters-container"})
    if parameters_container:
        params = parameters_container.find_all("p", class_="css-13x8d99")
        for p in params:
            parameter_list.append(p.get_text().strip())

    # IMAGES (gallery slides)
    images = soup.select('img[data-testid^="swiper-image"]')
    unique_links = set()
    for img in images:
        link = img.get("src")
        if link:
            unique_links.add(link)

    return {
        "platform": "olx",
        "url": url,
        "title": title,
        "description": description,
        "parameters": parameter_list,
        "image_urls": list(unique_links)
    }
|
| 51 |
+
|
| 52 |
+
if __name__ == "__main__":
    # Manual smoke test against a live listing.
    offer_url = input("OLX URL: ")
    print(scrape_olx_offer(offer_url))
|
requirements.txt
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
torchvision
|
| 3 |
+
transformers
|
| 4 |
+
pillow
|
| 5 |
+
numpy
|
| 6 |
+
scikit-learn
|
| 7 |
+
tqdm
|
| 8 |
+
fastapi
|
| 9 |
+
uvicorn
|
| 10 |
+
python-multipart
|
| 11 |
+
undetected_chromedriver
|
| 12 |
+
webdriver-manager
|
| 13 |
+
bs4
|
| 14 |
+
requests
|
| 15 |
+
flask
|
| 16 |
+
selenium>=4.0
|
| 17 |
+
huggingface_hub
|