Spaces:
Sleeping
Sleeping
"""
Simple scraper for Hyperplanning - no browser required.
Uses only requests + BeautifulSoup.
"""
| import requests | |
| from bs4 import BeautifulSoup | |
| import json | |
| import re | |
| from datetime import datetime | |
| import time | |
# Root URL of the Firebase Realtime Database used to store the timetable.
FIREBASE_DB_URL = 'https://classehub-40d27-default-rtdb.firebaseio.com'


def save_to_firebase(data):
    """Write ``data`` to the ``emploi_du_temps`` node of the Firebase database.

    The payload is sent as JSON with an HTTP PUT, which replaces whatever is
    currently stored at that node.

    Args:
        data: JSON-serializable object to store.

    Returns:
        True when the server answers with HTTP 200, False on any other status
        or when the request could not be sent. Failures are reported on
        stdout and never raised to the caller.
    """
    try:
        # An explicit timeout is required: requests waits forever by default,
        # which would hang the whole run if Firebase stops answering.
        response = requests.put(
            f'{FIREBASE_DB_URL}/emploi_du_temps.json',
            json=data,
            timeout=30,
        )
    except (requests.RequestException, TypeError, ValueError) as e:
        # RequestException covers transport failures; TypeError/ValueError
        # cover payloads that cannot be serialized to JSON.
        print(f"❌ Erreur Firebase: {e}")
        return False

    if response.status_code == 200:
        print("✅ Sauvegardé dans Firebase")
        return True

    print(f"❌ Erreur Firebase: {response.status_code}")
    return False
class HyperplanningScraperSimple:
    """Browser-less scraper for a Hyperplanning timetable site.

    Three strategies are tried in order by :meth:`scraper`: probing a list of
    candidate API endpoints, extracting data from the HTML of the guest page,
    and finally reading the copy previously stored in Firebase.

    Progress and errors are reported with ``print``; the public methods
    return ``None`` when they find nothing.
    """

    # Candidate API paths probed by tester_endpoints_api(), relative to base_url.
    _API_ENDPOINTS = (
        "/hp/api/planning",
        "/hp/api/cours",
        "/hp/api/emploi",
        "/hp/data/planning.json",
        "/hp/data/emploi.json",
        "/hp/getplanning",
        "/hp/getcours",
    )

    # A <script> is only inspected when it mentions one of these words.
    _SCRIPT_KEYWORDS = ('cours', 'planning', 'emploi')

    # Patterns are compiled once here instead of on every call.
    _COURS_JSON_RE = re.compile(r'\{[^{}]*"cours"[^{}]*\}')
    _COURS_CLASS_RE = re.compile(r'cours|course|event')
    _START_CALL_RE = re.compile(r'Start\([^)]+\)')

    def __init__(self):
        """Create the HTTP session and set the target site and promotion."""
        self.base_url = "https://svr-appweb.2ie-edu.org"
        self.session = requests.Session()
        # Accept-Encoding is deliberately NOT overridden: advertising "br"
        # makes the server free to answer with Brotli, which cannot be decoded
        # unless a Brotli library is installed. The session default only lists
        # encodings that can actually be decoded.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
        })
        self.promotion = "S8 IA CYCLE ING IIA (2ème année)"

    def extraire_donnees_page(self, html):
        """Look for course data embedded in an HTML document.

        Three heuristics are tried in order; the first one that yields
        something wins:

        1. flat JSON-like objects containing a ``"cours"`` key inside
           ``<script>`` tags (returned as a list of matched strings);
        2. elements carrying a ``data-cours`` attribute (returned as a list
           of the attribute values);
        3. elements whose CSS class matches ``cours``, ``course`` or
           ``event`` (returned as the BeautifulSoup result set).

        Args:
            html: HTML source of the page.

        Returns:
            The extracted items, or None when nothing is found or when
            parsing fails.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')

            # Method 1: JSON fragments inside scripts.
            for script in soup.find_all('script'):
                content = script.string
                if not content:
                    continue
                if not any(word in content for word in self._SCRIPT_KEYWORDS):
                    continue
                matches = self._COURS_JSON_RE.findall(content)
                if matches:
                    print(" ✓ JSON trouvé dans script")
                    return matches

            # Method 2: elements with a data-cours attribute.
            data_divs = soup.find_all(attrs={'data-cours': True})
            if data_divs:
                print(f" ✓ {len(data_divs)} éléments avec data-cours")
                return [div.get('data-cours') for div in data_divs]

            # Method 3: elements whose class name looks course-related.
            cours_elements = soup.find_all(class_=self._COURS_CLASS_RE)
            if cours_elements:
                print(f" ✓ {len(cours_elements)} éléments de cours détectés")
                return cours_elements

            return None
        except Exception as e:
            # Best-effort extraction: any parser failure is reported and
            # treated as "no data" rather than aborting the run.
            print(f" ❌ Erreur extraction: {e}")
            return None

    def tester_endpoints_api(self):
        """Probe the candidate API endpoints and return the first JSON payload.

        Returns:
            The decoded JSON of the first endpoint that answers HTTP 200 with
            a JSON content type, or None when no endpoint does.
        """
        print("🔍 Test des endpoints API...")
        for endpoint in self._API_ENDPOINTS:
            try:
                url = f"{self.base_url}{endpoint}"
                print(f" Essai: {endpoint}")
                resp = self.session.get(url, timeout=10)
                if resp.status_code != 200:
                    continue
                print(f" ✓ {endpoint} accessible!")
                if 'json' in resp.headers.get('Content-Type', ''):
                    data = resp.json()
                    print(f" ✓ JSON reçu: {len(str(data))} chars")
                    return data
                print(" → HTML reçu")
            except Exception as e:
                # Probing is best-effort: a failing endpoint must not stop
                # the remaining ones from being tried.
                print(f" ✗ {endpoint}: {type(e).__name__}")
        return None

    def scraper_page_invite(self):
        """Download the guest page and try to extract course data from it.

        Returns:
            Whatever :meth:`extraire_donnees_page` extracted, or None when
            the request fails, the server answers with an HTTP error status,
            or no structured data is found.
        """
        print("🌐 Accès à la page invite...")
        try:
            url = f"{self.base_url}/hp/invite"
            print(f" URL: {url}")
            resp = self.session.get(url, timeout=30)
            print(f" ✓ Status: {resp.status_code}")
            print(f" ✓ Taille: {len(resp.text)} chars")

            # An error page must not be scraped: its markup could match the
            # loose heuristics and be recorded as a successful extraction.
            if not resp.ok:
                print(f" ⚠️ Réponse HTTP inattendue: {resp.status_code}")
                return None

            donnees = self.extraire_donnees_page(resp.text)
            if donnees:
                print(f" ✓ Données extraites: {len(donnees)} éléments")
                return donnees

            print(" ⚠️ Pas de données structurées trouvées")
            # Print a few hints about the page to help debugging.
            soup = BeautifulSoup(resp.text, 'html.parser')
            print(f" → Title: {soup.title.string if soup.title else 'N/A'}")
            match = self._START_CALL_RE.search(resp.text)
            if match:
                print(f" → Fonction Start() détectée: {match.group()[:100]}")
            return None
        except Exception as e:
            # Best-effort: network and parsing failures are reported, not raised.
            print(f" ❌ Erreur: {e}")
            return None

    def scraper(self):
        """Run the extraction strategies in order and return the first result.

        Returns:
            The dict built by :meth:`formater_donnees` when the API probe or
            the HTML extraction succeeds, the JSON stored in Firebase when
            only the fallback succeeds, or None when every approach fails.
        """
        print("=" * 60)
        print("🚀 SCRAPER SIMPLE (requests only)")
        print("=" * 60)
        print("")

        # Approach 1: direct API endpoints.
        print("📡 APPROCHE 1: Endpoints API directs")
        data = self.tester_endpoints_api()
        if data:
            print("✅ Données trouvées via API!")
            return self.formater_donnees(data)
        print("")

        # Approach 2: HTML page.
        print("📄 APPROCHE 2: Extraction depuis HTML")
        data = self.scraper_page_invite()
        if data:
            print("✅ Données extraites du HTML!")
            return self.formater_donnees(data)
        print("")

        # Approach 3: fall back to the copy stored in Firebase.
        print("💾 APPROCHE 3: Lecture depuis Firebase (fallback)")
        cached = None
        try:
            # Timeout so an unreachable Firebase cannot hang the run.
            resp = requests.get(
                f'{FIREBASE_DB_URL}/emploi_du_temps.json',
                timeout=30,
            )
            if resp.status_code == 200:
                # Decode once and reuse the result.
                cached = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON. Unlike a bare
            # except, this lets KeyboardInterrupt/SystemExit propagate.
            print(f"❌ Erreur Firebase: {e}")
        if cached:
            print("✅ Données lues depuis Firebase")
            return cached

        print("")
        print("❌ ÉCHEC: Aucune approche n'a fonctionné")
        print("=" * 60)
        return None

    def formater_donnees(self, data):
        """Wrap extracted data in the record layout stored in Firebase.

        Args:
            data: Extracted data of any type; only its ``str()`` form is kept.

        Returns:
            A dict with the promotion name, the extraction time as an ISO 8601
            string from ``datetime.now()``, the first 1000 characters of
            ``str(data)`` and ``success`` set to True.
        """
        # Basic format for now: the raw data is truncated for debugging.
        return {
            "promotion": self.promotion,
            "date_extraction": datetime.now().isoformat(),
            "raw_data": str(data)[:1000],
            "success": True,
        }
def main():
    """Run the scraper once and persist a successful result.

    Returns:
        The scraper result after handing it to ``save_to_firebase``, or None
        when the scraper produced nothing.
    """
    outcome = HyperplanningScraperSimple().scraper()
    if not outcome:
        return None
    save_to_firebase(outcome)
    return outcome


if __name__ == "__main__":
    main()