hatamo commited on
Commit
422c1f3
·
0 Parent(s):

Fresh repo

Browse files
Dockerfile ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10

WORKDIR /app

# System packages needed to fetch and install Chrome.
RUN apt-get update && apt-get install -y \
    wget \
    curl \
    unzip \
    && rm -rf /var/lib/apt/lists/*

# Install Google Chrome using the signed-by keyring approach only.
# (apt-key is deprecated and removed from recent Debian releases, so the
# old `curl | apt-key add -` path was dead weight.)
RUN mkdir -p /etc/apt/keyrings && \
    curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-chrome.gpg && \
    echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
    apt-get update && apt-get install -y google-chrome-stable && \
    rm -rf /var/lib/apt/lists/*

# Copy requirements first so dependency layers are cached across code changes.
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire project.
COPY . .

# Hugging Face Spaces expects the app to listen on port 7860.
EXPOSE 7860

# Unbuffered stdout so logs appear immediately in the Space console.
ENV PYTHONUNBUFFERED=1

# Run the app (root-level app.py starts uvicorn).
CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Antique Auth API
3
+ emoji: 🏆
4
+ colorFrom: red
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ short_description: API used for my team project classes
9
+ ---
10
+
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app.py - Main entry point for Hugging Face Spaces
"""Load the FastAPI app defined in code/app.py.

The project package directory is named `code`, which shadows the
standard-library `code` module, so the file is loaded directly via
importlib instead of a regular `import code.app`.
"""
import os
import sys
import importlib.util

HERE = os.path.dirname(__file__)
CODE_DIR = os.path.join(HERE, "code")
app_path = os.path.join(CODE_DIR, "app.py")

# Keep code/ on sys.path for the lifetime of the process: code/app.py
# performs lazy imports at request time (e.g. `from web_scraper_allegro
# import ...` inside its endpoint handler), so removing the path after
# startup — as the previous version did — made those imports fail on the
# first scraping request.
if CODE_DIR not in sys.path:
    sys.path.insert(0, CODE_DIR)

spec = importlib.util.spec_from_file_location("antique_auth_code_app", app_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)

# The FastAPI `app` object expected inside code/app.py
app = getattr(module, "app")

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
code/app.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, UploadFile, Form, File
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import torch
from PIL import Image
import io
from model import AuctionAuthenticityModel
from config import (
    AUTHENTICITY_CLASSES,
    CATEGORIES,
    UNCERTAINTY_CONFIDENCE_THRESHOLD,
    UNCERTAINTY_MARGIN_THRESHOLD,
    UNCERTAIN_CATEGORY,
)
from torchvision import transforms
import os
import numpy as np
from huggingface_hub import hf_hub_download

app = FastAPI(
    title="Antique Auction Authenticity API",
    description="AI model for antique auction authenticity evaluation",
    version="1.0.0",
)

# Allow any origin — the API is presumably consumed by a separate
# frontend; NOTE(review): tighten allow_origins if credentials matter.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# All inference runs on CPU.
DEVICE = torch.device("cpu")

# Model weights live on the HF Hub and are downloaded by the startup hook.
MODEL_REPO_ID = os.getenv("MODEL_REPO_ID", "hatamo/auction-authenticity-model")
MODEL_FILENAME = "auction_model.pt"  # whatever you pushed

# Populated once by load_model() at application startup.
authenticity_model = None

# Standard ImageNet preprocessing (224x224, ImageNet mean/std).
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)
48
+
49
+
50
@app.on_event("startup")
async def load_model():
    """Download the model weights from the HF Hub and load them once at startup.

    Sets the module-global `authenticity_model` and puts it in eval mode.
    """
    global authenticity_model
    print("🚀 Loading model...")

    # download from HF Hub to /root/.cache/huggingface/hub/...
    local_model_path = hf_hub_download(
        repo_id=MODEL_REPO_ID,
        filename=MODEL_FILENAME,
    )

    authenticity_model = AuctionAuthenticityModel(device=DEVICE).to(DEVICE)
    state_dict = torch.load(local_model_path, map_location=DEVICE)
    authenticity_model.load_state_dict(state_dict)
    authenticity_model.eval()
    print("✓ Model ready")
66
+
67
+
68
def predict_single(img_tensor, text):
    """Run one forward pass and return (auth_probs, cat_probs) as numpy arrays."""
    with torch.no_grad():
        result = authenticity_model(img_tensor, [text])
        probs_a = result["auth_probs"][0].cpu().numpy()
        probs_c = result["cat_probs"][0].cpu().numpy()
    return probs_a, probs_c
74
+
75
+
76
def build_verdict(probs, labels):
    """Summarize a probability vector into a verdict tuple.

    Returns (probs_dict, best_label, best_prob, margin, uncertain), where
    `uncertain` is True when the top probability is below the configured
    confidence threshold or the gap to the runner-up is below the margin
    threshold.
    """
    probs_dict = {labels[i]: float(probs[i]) for i in range(len(labels))}

    best_label = max(probs_dict, key=probs_dict.get)
    best_prob = probs_dict[best_label]

    # Gap between the two highest probabilities.
    ranked = sorted(probs_dict.values(), reverse=True)
    margin = ranked[0] - ranked[1]

    too_low = best_prob < UNCERTAINTY_CONFIDENCE_THRESHOLD
    too_close = margin < UNCERTAINTY_MARGIN_THRESHOLD
    uncertain = too_low or too_close

    return probs_dict, best_label, best_prob, margin, uncertain
90
+
91
+
92
+
93
@app.post("/validate_url")
async def validate_url(url: str = Form(...), max_images: int = Form(3)):
    """Scrape an auction listing URL and evaluate authenticity + category.

    Supports Allegro, OLX and eBay URLs. Up to `max_images` photos
    (clamped to 1..10) are scored individually and the per-image
    probabilities are averaged before building the verdict.

    Returns 400 for unsupported platforms or listings without images,
    500 (with traceback) for unexpected failures.
    """
    try:
        from io import BytesIO
        import requests

        # Clamp requested image count to a sane range.
        max_images = max(1, min(max_images, 10))

        # Scrapers are imported lazily so one platform's heavy/broken
        # dependencies don't prevent the API from starting.
        if "allegro.pl" in url:
            from web_scraper_allegro import scrape_allegro_offer

            auction = scrape_allegro_offer(url)
        elif "olx.pl" in url:
            from web_scraper_olx import scrape_olx_offer

            auction = scrape_olx_offer(url)
        elif "ebay." in url:
            from web_scraper_ebay import scrape_ebay_offer

            auction = scrape_ebay_offer(url)
        else:
            return JSONResponse({"error": "Unsupported platform"}, status_code=400)

        if not auction.get("image_urls"):
            return JSONResponse({"error": "No images"}, status_code=400)

        images_to_use = min(max_images, len(auction["image_urls"]))

        auth_probs_list = []
        cat_probs_list = []

        # The text input is shared by all images of the listing.
        text = auction["title"] + " " + auction.get("description", "")

        for img_url in auction["image_urls"][:images_to_use]:
            img_resp = requests.get(img_url, timeout=15)
            img_resp.raise_for_status()

            img = Image.open(BytesIO(img_resp.content)).convert("RGB")
            img_tensor = transform(img).unsqueeze(0).to(DEVICE)

            auth_probs, cat_probs = predict_single(img_tensor, text)

            auth_probs_list.append(auth_probs)
            cat_probs_list.append(cat_probs)

        # Average per-image probabilities into one listing-level verdict.
        avg_auth_probs = np.mean(auth_probs_list, axis=0)
        avg_cat_probs = np.mean(cat_probs_list, axis=0)

        auth_dict, best_auth, best_auth_prob, auth_margin, auth_uncertain = build_verdict(
            avg_auth_probs, AUTHENTICITY_CLASSES
        )

        cat_dict, best_cat, best_cat_prob, cat_margin, cat_uncertain = build_verdict(
            avg_cat_probs, CATEGORIES
        )

        auth_verdict = "UNCERTAIN" if auth_uncertain else best_auth
        category_verdict = UNCERTAIN_CATEGORY if cat_uncertain else best_cat

        return JSONResponse(
            {
                "status": "success",
                "evaluation": {
                    "title": auction["title"],
                    "image_urls": auction["image_urls"][:images_to_use],
                    # BUGFIX: the OLX and eBay scrapers do not return a
                    # "price" key, so auction["price"] raised KeyError for
                    # them (turning every such request into a 500).
                    "price": auction.get("price"),
                    "category": None
                    if category_verdict == UNCERTAIN_CATEGORY
                    else category_verdict,
                    "evaluation_status": auth_verdict,
                    "confidence": round(best_auth_prob, 3),
                },
                "details": {
                    "url": url,
                    "platform": auction["platform"],
                    "image_count_used": images_to_use,
                    "authenticity": {
                        "verdict": auth_verdict,
                        "confidence": round(best_auth_prob, 3),
                        "margin": round(auth_margin, 3),
                        "probabilities": {
                            k: round(v, 3) for k, v in auth_dict.items()
                        },
                    },
                    "category": {
                        "verdict": category_verdict,
                        "label": best_cat,
                        "confidence": round(best_cat_prob, 3),
                        "margin": round(cat_margin, 3),
                        "probabilities": {
                            k: round(v, 3) for k, v in cat_dict.items()
                        },
                    },
                },
            }
        )

    except Exception as e:
        import traceback

        # Surface the traceback so failures are debuggable from the response.
        return JSONResponse(
            {"status": "error", "error": str(e), "traceback": traceback.format_exc()},
            status_code=500,
        )
197
+
198
+
199
@app.get("/health")
def health():
    """Simple liveness check."""
    return {"status": "ok", "message": "API running"}
202
+
203
+
204
@app.get("/")
def root():
    """API metadata and a short endpoint directory."""
    return {
        "name": "Antique Auction Authenticity API",
        "version": "1.0.0",
        # Fixed: the evaluation endpoint is POST /validate_url;
        # "POST /predict" did not exist in this app.
        "endpoints": {
            "POST /validate_url": "Evaluate auction",
            "GET /health": "Health check",
        },
    }
211
+
212
+
213
if __name__ == "__main__":
    import uvicorn

    # Direct local run of this module; in the Docker image the root-level
    # app.py is what actually starts uvicorn.
    uvicorn.run(app, host="0.0.0.0", port=7860)
code/config.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# config.py
"""Label and threshold configuration.

Keeping labels here allows elastic modification without touching the
model or API code.
"""

# Authenticity status labels (main head — changing these requires retraining).
AUTHENTICITY_CLASSES = {
    0: "ORIGINAL",
    1: "SCAM",
    2: "REPLICA",
}

# Reverse lookup: label name -> class id.
AUTHENTICITY_TO_ID = {name: idx for idx, name in AUTHENTICITY_CLASSES.items()}

# Item categories (separate classifier head). Edit these five as needed;
# the special "Uncertain" bucket below is applied automatically at low
# confidence and is never produced by the model itself.
CATEGORIES = {
    0: "Clocks",
    1: "Furniture",
    2: "Numismatics",
    3: "Sabers",
    4: "Tableware",
}

# Reverse lookup: category name -> class id.
CATEGORY_TO_ID = {name: idx for idx, name in CATEGORIES.items()}

# Label used when the category head is not confident enough.
UNCERTAIN_CATEGORY = "Uncertain"

# A prediction is "uncertain" when the top probability is below this...
UNCERTAINTY_CONFIDENCE_THRESHOLD = 0.6
# ...or the gap between the top two probabilities is below this.
UNCERTAINTY_MARGIN_THRESHOLD = 0.15
code/model.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model.py
2
+ import torch
3
+ import torch.nn as nn
4
+ from transformers import DistilBertTokenizer, DistilBertModel
5
+ from torchvision.models import efficientnet_b0
6
+ from config import AUTHENTICITY_CLASSES, CATEGORIES
7
+
8
class AuctionAuthenticityModel(nn.Module):
    """Multimodal (image + text) model with two classification heads.

    An EfficientNet-B0 backbone encodes the listing photo and a
    multilingual DistilBERT encodes the listing text; the concatenated
    features pass through a shared fusion MLP feeding an authenticity
    head and a category head.
    """

    def __init__(self, num_classes=None, device='cpu'):
        # If num_classes not specified, use config
        if num_classes is None:
            num_classes = len(AUTHENTICITY_CLASSES)
        # Category classes (separate head)
        num_categories = len(CATEGORIES)
        super().__init__()
        self.device = device

        # Vision backbone: EfficientNet-B0 with its classifier stripped so
        # it emits raw 1280-d features.
        # NOTE(review): `pretrained=True` is deprecated in newer torchvision
        # in favour of `weights=...` — confirm the pinned version accepts it.
        self.vision_model = efficientnet_b0(pretrained=True)
        self.vision_model.classifier = nn.Identity()
        vision_out_dim = 1280

        # Text encoder: multilingual DistilBERT (listings may be Polish).
        self.text_model = DistilBertModel.from_pretrained(
            'distilbert-base-multilingual-cased'
        )
        text_out_dim = 768

        # Tokenizer is owned by the model so forward() can take raw strings.
        self.tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-multilingual-cased'
        )

        # Fusion encoder (shared) -> then two heads (authenticity + category)
        hidden_dim = 256
        self.fusion_encoder = nn.Sequential(
            nn.Linear(vision_out_dim + text_out_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
        )

        # Heads
        self.auth_head = nn.Linear(128, num_classes)
        self.cat_head = nn.Linear(128, num_categories)

        # store sizes for reference
        self.num_classes = num_classes
        self.num_categories = num_categories

    def forward(self, images, texts):
        """Run both encoders and heads.

        Args:
            images: image batch tensor (fed elsewhere as (B, 3, 224, 224),
                ImageNet-normalized).
            texts: list of strings, one per batch element.

        Returns:
            dict with 'auth_logits'/'auth_probs'/'cat_logits'/'cat_probs'.
        """
        vision_features = self.vision_model(images)
        # Tokenize on the fly; truncate to DistilBERT's 512-token limit.
        tokens = self.tokenizer(
            texts, padding=True, truncation=True, max_length=512, return_tensors='pt'
        ).to(self.device)
        text_outputs = self.text_model(**tokens)
        # Use the first ([CLS]) token embedding as the text representation.
        text_features = text_outputs.last_hidden_state[:, 0, :]

        combined = torch.cat([vision_features, text_features], dim=1)
        shared = self.fusion_encoder(combined)

        auth_logits = self.auth_head(shared)
        cat_logits = self.cat_head(shared)

        # probabilities
        auth_probs = torch.softmax(auth_logits, dim=1)
        cat_probs = torch.softmax(cat_logits, dim=1)

        return {
            'auth_logits': auth_logits,
            'auth_probs': auth_probs,
            'cat_logits': cat_logits,
            'cat_probs': cat_probs,
        }

    def compute_loss(self, outputs, auth_labels=None, cat_labels=None, auth_weight=1.0, cat_weight=1.0):
        """Compute combined loss for two heads. Labels should be LongTensors on same device.

        Returns combined scalar loss and a dict with individual losses.
        """
        losses = {}
        loss = 0.0
        criterion = nn.CrossEntropyLoss()

        if auth_labels is not None:
            l_auth = criterion(outputs['auth_logits'], auth_labels)
            losses['auth_loss'] = l_auth
            loss = loss + auth_weight * l_auth

        if cat_labels is not None:
            # Allow sentinel -1 for unknown/uncertain categories and ignore them
            if cat_labels.dim() == 1:
                mask = cat_labels >= 0
            else:
                mask = (cat_labels.squeeze(-1) >= 0)

            if mask.sum().item() > 0:
                selected_logits = outputs['cat_logits'][mask]
                selected_labels = cat_labels[mask]
                l_cat = criterion(selected_logits, selected_labels)
                losses['cat_loss'] = l_cat
                loss = loss + cat_weight * l_cat
            else:
                # No valid category labels in batch
                losses['cat_loss'] = torch.tensor(0.0, device=self.device)

        return loss, losses

    def count_parameters(self):
        """Return the number of trainable parameters."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
112
+
113
+
114
if __name__ == '__main__':
    # Smoke test: build the model on CPU and run a dummy forward pass.
    print("Testowanie modelu...")

    device = torch.device('cpu')
    model = AuctionAuthenticityModel(device=device).to(device)

    print(f"✓ Model stworzony")
    print(f" - Parametrów: {model.count_parameters():,}")

    # Dummy test
    dummy_img = torch.randn(2, 3, 224, 224).to(device)
    dummy_texts = ["Silver spoon antique", "Polish silverware 19th century"]

    with torch.no_grad():
        output = model(dummy_img, dummy_texts)

    # Print shapes
    print("✓ Forward pass:")
    print(f" - auth_logits: {output['auth_logits'].shape}")
    print(f" - auth_probs: {output['auth_probs'].shape}")
    print(f" - cat_logits: {output['cat_logits'].shape}")
    print(f" - cat_probs: {output['cat_probs'].shape}")

    # Show predicted labels and top probabilities
    auth_pred = torch.argmax(output['auth_probs'], dim=1)
    cat_pred = torch.argmax(output['cat_probs'], dim=1)

    for i in range(output['auth_probs'].shape[0]):
        a_idx = int(auth_pred[i].item())
        a_prob = float(output['auth_probs'][i, a_idx].item())
        c_idx = int(cat_pred[i].item())
        c_prob = float(output['cat_probs'][i, c_idx].item())
        # Map class ids back to human-readable labels from config.
        a_name = AUTHENTICITY_CLASSES.get(a_idx, str(a_idx))
        c_name = CATEGORIES.get(c_idx, str(c_idx))
        print(f"\nSample {i}:")
        print(f" - Authenticity: {a_name} ({a_prob:.3f})")
        print(f" - Category: {c_name} ({c_prob:.3f})")

    # Estimate on-disk model size by serializing to a temp file.
    print(f"\n📊 Rozmiar modelu:")
    torch.save(model.state_dict(), 'temp_model.pt')
    import os
    size_mb = os.path.getsize('temp_model.pt') / (1024*1024)
    print(f" - {size_mb:.1f} MB")
    os.remove('temp_model.pt')
code/web_scraper_allegro.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from apify_client import ApifyClient
2
+ import os
3
+ import re
4
+
5
+
6
def sanitize_folder_name(text):
    """Turn arbitrary text into a filesystem-safe folder name.

    Polish diacritics are transliterated to ASCII, any other
    non-alphanumeric character becomes an underscore, runs of
    underscores are collapsed, and leading/trailing underscores
    are stripped.
    """
    polish_chars = {
        "ą": "a", "ć": "c", "ę": "e", "ł": "l", "ń": "n",
        "ó": "o", "ś": "s", "ź": "z", "ż": "z"
    }
    pieces = []
    for ch in text.lower():
        if ch in polish_chars:
            pieces.append(polish_chars[ch])
        elif ch.isalnum():
            pieces.append(ch)
        else:
            pieces.append("_")
    name = "".join(pieces)
    while "__" in name:
        name = name.replace("__", "_")
    return name.strip("_")
24
+
25
+
26
def extract_price(price_str):
    """Pull a numeric price out of a free-form price string.

    Returns the first number found (comma decimal separator normalized
    to a dot), None for empty/falsy input, or the input unchanged when
    no number is present.
    """
    if not price_str:
        return None
    found = re.search(r'(\d+[.,]\d{2}|\d+)', str(price_str))
    if found is None:
        return price_str
    return found.group(1).replace(',', '.')
34
+
35
+
36
def extract_images_from_apify(item_data):
    """Collect image URLs from an Apify item, upgraded to /original/ size.

    Looks at the 'images', 'image' and 'imageUrl' fields, keeps only URLs
    that look like image CDN links, rewrites known thumbnail sizes to the
    /original/ variant, and deduplicates the result.
    """
    allowed_sizes = ["/s128/", "/s360/", "/s512/", "/s720/", "/s1024/", "/s1440/", "/original/"]

    candidates = []
    images_field = item_data.get('images')
    if images_field:
        if isinstance(images_field, list):
            candidates.extend(images_field)
        else:
            candidates.append(images_field)
    if item_data.get('image'):
        candidates.append(item_data['image'])
    if item_data.get('imageUrl'):
        candidates.append(item_data['imageUrl'])

    unique_links = set()
    for candidate in candidates:
        if candidate and isinstance(candidate, str):
            if "allegroimg.com" in candidate or "img" in candidate:
                for size in allowed_sizes:
                    candidate = candidate.replace(size, "/original/")
                unique_links.add(candidate)

    return list(unique_links)
62
+
63
+
64
def scrape_allegro_offer(url: str):
    """Scrape a single Allegro product via the hosted Apify e-commerce actor.

    Always returns a result dict with platform/url/title/description/
    price/image_urls keys; scraping failures are reported inside the
    dict rather than raised, so one bad listing doesn't break callers.

    Raises:
        ValueError: if the APIFY_API_TOKEN environment variable is not set.
    """
    api_token = os.getenv('APIFY_API_TOKEN')
    if not api_token:
        raise ValueError("APIFY_API_TOKEN environment variable not set")

    client = ApifyClient(api_token)

    # Correct input format for E-commerce Scraping Tool
    run_input = {
        "startUrls": [
            url
        ]
    }

    print(f"🔍 Scraping: {url}")

    try:
        # Run the actor synchronously, then read its default dataset.
        actor_call = client.actor("e-commerce/allegro-product-detail-scraper").call(
            run_input=run_input
        )
        dataset_client = client.dataset(actor_call['defaultDatasetId'])
        items = list(dataset_client.iterate_items())

        if not items:
            print("⚠️ No data returned from Apify")
            # Empty-but-valid result so callers can handle it uniformly.
            return {
                "platform": "allegro",
                "url": url,
                "title": "untitled",
                "description": "No description",
                "price": None,
                "image_urls": []
            }

        # Only the first dataset item matters for a product-detail scrape.
        item = items[0]
        print(f"✅ Success! Found: {item.get('productTitle', 'untitled')}")

        image_urls = extract_images_from_apify(item)

        # Fall back to the thumbnail when no gallery images were found.
        if not image_urls:
            thumbnail = item.get("thumbnail")
            if thumbnail:
                image_urls = [thumbnail]

        return {
            "platform": "allegro",
            "url": item.get('url', url),
            "title": item.get('productTitle', 'untitled').strip(),
            "description": item.get('description', 'No description'),
            "price": extract_price(item.get('price', item.get('currentPrice'))),
            "image_urls": image_urls
        }

    except Exception as e:
        # Best-effort: report the failure in the result instead of raising.
        print(f"❌ Error: {e}")
        return {
            "platform": "allegro",
            "url": url,
            "title": "error",
            "description": str(e),
            "price": None,
            "image_urls": []
        }
129
+
130
+
131
# Example usage
if __name__ == "__main__":
    # Manual test: scrape a URL typed at the console and print a summary.
    url = input("Allegro URL: ")
    result = scrape_allegro_offer(url)

    print("\n✅ Scraping result:")
    print(f"Title: {result['title']}")
    print(f"Price: {result['price']}")
    # Truncate long descriptions for console readability.
    print(f"Description: {result['description'][:100]}..." if len(result['description']) > 100 else f"Description: {result['description']}")
    print(f"Images: {len(result['image_urls'])} found")
    # Show at most the first three image links.
    for img in result['image_urls'][:3]:
        print(f" - {img}")
code/web_scraper_ebay.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # scrape_ebay_offer.py
2
+ import undetected_chromedriver as uc
3
+ from selenium.webdriver.common.by import By
4
+ from webdriver_manager.chrome import ChromeDriverManager
5
+ from selenium.webdriver.chrome.service import Service
6
+ import time
7
+ import requests
8
+ import os
9
+
10
def scrape_ebay_offer(url: str):
    """Scrape an eBay listing and return its data without saving to disk.

    Uses undetected-chromedriver in headless mode. Returns a dict with
    title, description, parameters and image URLs; each section falls
    back to a default instead of failing the whole scrape. The driver is
    always quit, even on error.
    """
    print(f"🔍 eBay: {url}")
    options = uc.ChromeOptions()
    options.add_argument("--window-position=-3000,0")
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    # Point at the system Chrome binary when present (Docker image).
    if os.path.exists('/usr/bin/google-chrome'):
        options.binary_location = '/usr/bin/google-chrome'

    driver = uc.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
        use_subprocess=True
    )

    try:
        driver.get(url)
        time.sleep(4)  # give dynamic content time to render

        # TITLE
        try:
            title_element = driver.find_element(By.CSS_SELECTOR, "h1.x-item-title__mainTitle")
            title_str = title_element.text.strip()
        # Fixed: bare `except:` also swallowed SystemExit/KeyboardInterrupt.
        except Exception:
            title_str = "untitled_ebay"

        # PARAMETERS (label/value rows of the item-specifics table)
        parameter_list = []
        try:
            rows = driver.find_elements(By.CSS_SELECTOR, ".ux-labels-values")
            for row in rows:
                try:
                    label = row.find_element(By.CSS_SELECTOR, ".ux-labels-values__labels").text.strip()
                    value = row.find_element(By.CSS_SELECTOR, ".ux-labels-values__values").text.strip()
                    if label and value:
                        parameter_list.append(f"{label}: {value}")
                except Exception:
                    continue
        except Exception:
            pass

        # DESCRIPTION lives inside an iframe
        description_content = "No description"
        try:
            frame = driver.find_element(By.ID, "desc_ifr")
            driver.switch_to.frame(frame)
            description_content = driver.find_element(By.TAG_NAME, "body").text.strip()
            driver.switch_to.default_content()
        except Exception:
            # Fixed: make sure we leave the iframe even when the body
            # lookup fails, otherwise later selectors searched the frame.
            driver.switch_to.default_content()

        # IMAGES: swap thumbnail sizes for larger variants
        unique_links = set()
        try:
            thumbnails = driver.find_elements(By.CSS_SELECTOR, ".ux-image-grid-item img")
            for img in thumbnails:
                src = img.get_attribute("src") or img.get_attribute("data-src")
                if src and "ebayimg.com" in src:
                    # NOTE(review): the "/s-l64/" path form (with slashes) is
                    # assumed here; verify against current eBay image URLs.
                    hd_link = src.replace("/s-l64/", "/s-l1600").replace("/s-l140/", "/s-l1600")
                    unique_links.add(hd_link)
        except Exception:
            pass

        return {
            "platform": "ebay",
            "url": url,
            "title": title_str,
            "description": description_content,
            "parameters": parameter_list,
            "image_urls": list(unique_links)
        }

    finally:
        driver.quit()
89
+
90
if __name__ == "__main__":
    # Manual test: scrape a URL typed at the console and dump the dict.
    url = input("eBay URL: ")
    result = scrape_ebay_offer(url)
    print(result)
code/web_scraper_olx.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # scrape_olx_offer.py
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+
5
def scrape_olx_offer(url: str):
    """Scrape an OLX listing and return its data without saving to disk.

    Raises ValueError when OLX responds with a non-200 status. The CSS
    class names below are OLX-generated and may change over time.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }

    print(f"🔍 OLX: {url}")
    # Fixed: added a timeout so a stalled OLX response cannot hang the
    # API worker indefinitely.
    response = requests.get(url, headers=headers, timeout=15)

    if response.status_code != 200:
        raise ValueError(f"OLX error: {response.status_code}")

    soup = BeautifulSoup(response.content, "html.parser")

    # TITLE
    title_element = soup.find("h4", class_="css-1au435n")
    title = title_element.get_text().strip() if title_element else "untitled"

    # DESCRIPTION
    description_element = soup.find("div", class_="css-19duwlz")
    description = description_element.get_text(separator="\n").strip() if description_element else "No description"

    # PARAMETERS
    parameter_list = []
    parameters_container = soup.find("div", attrs={"data-testid": "ad-parameters-container"})
    if parameters_container:
        params = parameters_container.find_all("p", class_="css-13x8d99")
        for p in params:
            parameter_list.append(p.get_text().strip())

    # IMAGES (gallery swiper images)
    images = soup.select('img[data-testid^="swiper-image"]')
    unique_links = set()
    for img in images:
        link = img.get("src")
        if link:
            unique_links.add(link)

    return {
        "platform": "olx",
        "url": url,
        "title": title,
        "description": description,
        "parameters": parameter_list,
        "image_urls": list(unique_links)
    }
51
+
52
if __name__ == "__main__":
    # Manual test: scrape a URL typed at the console and dump the dict.
    url = input("OLX URL: ")
    result = scrape_olx_offer(url)
    print(result)
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchvision
3
+ transformers
4
+ pillow
5
+ numpy
6
+ scikit-learn
7
+ tqdm
8
+ fastapi
9
+ uvicorn
10
+ python-multipart
11
+ undetected_chromedriver
12
+ webdriver-manager
13
+ beautifulsoup4
14
+ requests
15
+ flask
16
+ selenium>=4.0
17
+ huggingface_hub