Spaces:
Runtime error
Runtime error
Niv Sardi
committed on
Commit
·
fc32112
1
Parent(s):
f7e5bce
add more heuristics to find logos, might be too much
Browse files
Signed-off-by: Niv Sardi <xaiki@evilgiggle.com>
- crawler/common/selectors.py +4 -1
- crawler/imtool.py +1 -1
- crawler/screenshot.py +3 -1
- src/index.ts +29 -3
- src/selectors.ts +3 -1
crawler/common/selectors.py
CHANGED
|
@@ -1,6 +1,9 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
| 4 |
logosbancos = "img[src*=logosbancos]"
|
| 5 |
|
| 6 |
entity_http = "p.post-pagina-interior a[target=_blank][href*=http]"
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
|
| 3 |
+
img_logo = "img[src*=logo]"
|
| 4 |
+
id_logo = "*[id*=logo]"
|
| 5 |
+
cls_logo = "*[class*=logo]"
|
| 6 |
+
|
| 7 |
logosbancos = "img[src*=logosbancos]"
|
| 8 |
|
| 9 |
entity_http = "p.post-pagina-interior a[target=_blank][href*=http]"
|
crawler/imtool.py
CHANGED
|
@@ -9,7 +9,7 @@ from typing import NamedTuple
|
|
| 9 |
from entity import Entity
|
| 10 |
|
| 11 |
TILE_SIZE = 800
|
| 12 |
-
TILE_OVERLAP = 0.
|
| 13 |
|
| 14 |
class BoundingBox(NamedTuple):
|
| 15 |
x: float = 0.0
|
|
|
|
| 9 |
from entity import Entity
|
| 10 |
|
| 11 |
TILE_SIZE = 800
|
| 12 |
+
TILE_OVERLAP = 0.8
|
| 13 |
|
| 14 |
class BoundingBox(NamedTuple):
|
| 15 |
x: float = 0.0
|
crawler/screenshot.py
CHANGED
|
@@ -27,7 +27,9 @@ def sc_entity(e: Entity):
|
|
| 27 |
driver.save_screenshot(f"{e.DATA_PATH}/{e.bco}.png")
|
| 28 |
driver.save_full_page_screenshot(f"{e.DATA_PATH}/{e.bco}.full.png")
|
| 29 |
|
| 30 |
-
logos = driver.find_elements(By.CSS_SELECTOR, selectors.
|
|
|
|
|
|
|
| 31 |
with open(f"{e.DATA_PATH}/{e.bco}.full.txt", 'w') as f:
|
| 32 |
for i in logos:
|
| 33 |
f.write(f"{e.bco} {coord_to_point(i.rect)}\n")
|
|
|
|
| 27 |
driver.save_screenshot(f"{e.DATA_PATH}/{e.bco}.png")
|
| 28 |
driver.save_full_page_screenshot(f"{e.DATA_PATH}/{e.bco}.full.png")
|
| 29 |
|
| 30 |
+
logos = driver.find_elements(By.CSS_SELECTOR, selectors.img_logo) or []
|
| 31 |
+
logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.id_logo) or [])
|
| 32 |
+
logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.cls_logo) or [])
|
| 33 |
with open(f"{e.DATA_PATH}/{e.bco}.full.txt", 'w') as f:
|
| 34 |
for i in logos:
|
| 35 |
f.write(f"{e.bco} {coord_to_point(i.rect)}\n")
|
src/index.ts
CHANGED
|
@@ -22,6 +22,21 @@ queue.addEventListener("idle", async () => {
|
|
| 22 |
console.log("all done")
|
| 23 |
})
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
function process(o: { url: string, bco: string, name: string }): Promise<void> {
|
| 26 |
const promises: Promise<void>[] = [];
|
| 27 |
|
|
@@ -30,11 +45,22 @@ function process(o: { url: string, bco: string, name: string }): Promise<void> {
|
|
| 30 |
promises.push(new Promise<void>((accept, _reject) => {
|
| 31 |
page.once('load', async () => {
|
| 32 |
try {
|
| 33 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
let annotations = '';
|
| 35 |
for (const i in logos) {
|
| 36 |
-
const bb =
|
| 37 |
-
if (!bb
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
try {
|
| 40 |
await logos[i].screenshot({ path: `./data/logos/${o.bco}.logo${i}.png` })
|
|
|
|
| 22 |
console.log("all done")
|
| 23 |
})
|
| 24 |
|
| 25 |
+
async function get_logos(page, selector): {}[] {
|
| 26 |
+
const logos = await page.$$(selector) || [];
|
| 27 |
+
for (const i in logos) {
|
| 28 |
+
const bb = await page.evaluate(e => {
|
| 29 |
+
const { x, y, width, height } = e.getBoundingClientRect();
|
| 30 |
+
return {
|
| 31 |
+
x, y, width, height, top: window.screen.top, left: window.screen.left
|
| 32 |
+
}
|
| 33 |
+
}, logos[i])
|
| 34 |
+
logos[i].box = bb;
|
| 35 |
+
}
|
| 36 |
+
return logos;
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
function process(o: { url: string, bco: string, name: string }): Promise<void> {
|
| 41 |
const promises: Promise<void>[] = [];
|
| 42 |
|
|
|
|
| 45 |
promises.push(new Promise<void>((accept, _reject) => {
|
| 46 |
page.once('load', async () => {
|
| 47 |
try {
|
| 48 |
+
const imgs = await get_logos(page, selectors.img_logo);
|
| 49 |
+
const ids = await get_logos(page, selectors.id_logo);
|
| 50 |
+
const cls = await get_logos(page, selectors.class_logo);
|
| 51 |
+
const logos = [
|
| 52 |
+
...imgs, ...ids, ...cls
|
| 53 |
+
]
|
| 54 |
+
|
| 55 |
let annotations = '';
|
| 56 |
for (const i in logos) {
|
| 57 |
+
const bb = logos[i].box
|
| 58 |
+
if (!bb
|
| 59 |
+
|| (bb.width < 10)
|
| 60 |
+
|| (bb.height < 10)
|
| 61 |
+
|| (bb.x + bb.width < 0)
|
| 62 |
+
|| (bb.y + bb.height < 0)) continue;
|
| 63 |
+
console.log('got bb', o.bco, bb)
|
| 64 |
|
| 65 |
try {
|
| 66 |
await logos[i].screenshot({ path: `./data/logos/${o.bco}.logo${i}.png` })
|
src/selectors.ts
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
export default {
|
| 2 |
-
"
|
|
|
|
|
|
|
| 3 |
"logosbancos": "img[src*=logosbancos]",
|
| 4 |
"entity_http": "p.post-pagina-interior a[target=_blank][href*=http]",
|
| 5 |
"entity_mailto": "p.post-pagina-interior a[target=_blank][href*=mailto]"
|
|
|
|
| 1 |
export default {
|
| 2 |
+
"img_logo": "img[src*=logo]",
|
| 3 |
+
"id_logo": "*[id*=logo]",
|
| 4 |
+
"class_logo": "*[class*=logo]",
|
| 5 |
"logosbancos": "img[src*=logosbancos]",
|
| 6 |
"entity_http": "p.post-pagina-interior a[target=_blank][href*=http]",
|
| 7 |
"entity_mailto": "p.post-pagina-interior a[target=_blank][href*=mailto]"
|