# Spaces: Sleeping | File size: 2,615 Bytes | a8aea21
import logging
import sys
import os
import time
# Add current directory to path so we can import sibling scripts
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from pinterest_scraper import PinterestScraper, load_config, DEFAULT_QUERIES
# Logging setup: root logger emits INFO and above as "<time> [<LEVEL>] <message>".
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by everything in this script.
logger = logging.getLogger(__name__)
# Categories that are short on images and get an extra scraping pass.
# Each entry is a "<group>/<theme>" key; main() looks it up in DEFAULT_QUERIES.
TARGET_CATEGORIES = [
    # workshops
    "workshops/design",
    "workshops/coding",
    "workshops/business",
    # tech fest
    "tech_fest/hackathon",
    "tech_fest/general",
    "tech_fest/coding_competition",
    "tech_fest/web_app_dev",
    "tech_fest/cybersecurity",
    # festivals
    "festivals/navratri_garba",
    # sports
    "sports/general",
]

# Raw images to collect per category (target: 2800 raw to get ~1300 clean).
TARGET_COUNT = 2800
def main():
    """Run a targeted scraping pass over every entry in TARGET_CATEGORIES.

    Loads ``configs/config.yaml`` from the directory one level above this
    script's directory, builds a ``PinterestScraper`` for the configured
    raw-data directory, and calls ``scrape_category`` once per category.
    Categories without an entry in ``DEFAULT_QUERIES`` fall back to three
    generic queries built from the last path segment of the category key.

    A failure while scraping one category is logged (with traceback) and
    does not stop the remaining categories. Errors while loading the config
    or constructing the scraper are not caught and propagate to the caller.
    """
    logger.info("🚀 Starting Targeted Scraper for Low-Data Categories")
    logger.info(f"🎯 Target Count: {TARGET_COUNT} images per category")

    # Load config from the parent of this script's directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    config_path = os.path.join(os.path.dirname(script_dir), "configs", "config.yaml")
    config = load_config(config_path)
    raw_dir = config["paths"]["data"]["raw"]

    # Initialize scraper
    scraper = PinterestScraper(config, raw_dir)

    # Override the per-theme target on this scraper instance.
    # NOTE(review): this only has an effect if scrape_category reads
    # self.TARGET_PER_THEME rather than a module-level constant - confirm
    # against pinterest_scraper.
    scraper.TARGET_PER_THEME = TARGET_COUNT

    separator = "============================================================"

    for category in TARGET_CATEGORIES:
        logger.info("\n" + separator)
        logger.info(f"Processing: {category}")
        logger.info(separator)

        # Get queries for this category
        queries = DEFAULT_QUERIES.get(category)
        if not queries:
            logger.warning(f"⚠️ No specific queries found for {category}, generating generic ones.")
            # Fallback: derive generic queries from the theme part of the key,
            # e.g. "tech_fest/hackathon" -> "hackathon".
            theme = category.split("/")[-1]
            queries = [
                f"{theme} poster design",
                f"{theme} event flyer",
                f"creative {theme} poster",
            ]

        try:
            count = scraper.scrape_category(category, queries)
        except Exception as e:
            # Best-effort loop: record the failure with its traceback and
            # move on to the next category instead of aborting the run.
            logger.exception(f"❌ Failed processing {category}: {e}")
        else:
            logger.info(f"✅ Finished {category}: {count} total images")

        # Small break between categories
        time.sleep(2)

    logger.info("\n🎉 All targeted categories processed!")
# Run the targeted scrape only when executed as a script, not when imported.
if __name__ == "__main__":
    main()