# Spaces:
# Runtime error
# Runtime error
# Crawler
import re
import time
from urllib.parse import quote, urldefrag, urljoin

import requests
from bs4 import BeautifulSoup
def crawl_website(url, max_depth=2, current_depth=0, visited=None):
    """Recursively crawl ``url`` and collect the visible text of each page.

    Links are followed depth-first. No host filter is applied, so links that
    point to other sites are followed as well.

    Args:
        url: Absolute URL of the page to fetch.
        max_depth: Deepest link level to fetch; the start page is depth 0.
        current_depth: Depth of ``url`` within the crawl (set by the
            recursion; callers normally leave it at 0).
        visited: Set of URLs already fetched. It is shared by all recursive
            calls and mutated in place; a fresh set is created when ``None``.

    Returns:
        A list with one text string per successfully fetched page: the page
        at ``url`` first, followed by the pages reached from it. The list is
        empty when the depth limit is exceeded, ``url`` was already visited,
        or the request failed.
    """
    if visited is None:
        visited = set()
    if current_depth > max_depth or url in visited:
        return []
    visited.add(url)
    print(f"Crawling: {url} at depth {current_depth}")

    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    documents = [soup.get_text(separator='\n', strip=True)]

    # Children of a page at the depth limit would be rejected immediately,
    # so skip extracting links for them altogether.
    if current_depth >= max_depth:
        return documents

    links = []
    for anchor in soup.find_all('a', href=True):
        try:
            # urljoin resolves every relative form ("/a", "a.html", "../a",
            # "?q=1") against the current page; urldefrag drops "#section"
            # so in-page anchors do not look like new pages.
            target, _fragment = urldefrag(urljoin(url, anchor['href']))
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); ignore the link.
            continue
        # Drops mailto:, tel:, javascript: and other non-web schemes.
        if target.startswith(('http://', 'https://')):
            links.append(target)

    for link in links:
        documents.extend(
            crawl_website(link, max_depth, current_depth + 1, visited)
        )
    return documents
def url_to_filename(url):
    """Return a file name for ``url`` that is a single, safe path component.

    Every reserved character, including ``:`` and ``/``, is percent-encoded,
    so the name cannot be mistaken for a directory path and can be decoded
    back to the original URL.
    """
    return quote(url, safe="") + ".txt"


def parse_shard_args(argv):
    """Read the shard index and shard count from command-line arguments.

    Args:
        argv: Argument list laid out like ``sys.argv`` (program name first,
            then ``COUNT`` and ``TOTAL``). Arguments after the first two are
            ignored.

    Returns:
        ``(count, total)`` as integers. ``(0, 1)``, i.e. "process every
        URL", when neither argument is given.

    Raises:
        ValueError: If only one argument is given, a value is not an
            integer, or the values do not satisfy ``0 <= count < total``.
    """
    args = argv[1:3]
    if not args:
        return 0, 1
    if len(args) != 2:
        raise ValueError("expected two arguments: COUNT TOTAL")
    count, total = int(args[0]), int(args[1])
    if total <= 0 or not 0 <= count < total:
        raise ValueError(
            f"need 0 <= COUNT < TOTAL, got COUNT={count} TOTAL={total}"
        )
    return count, total


if __name__ == "__main__":
    # Start pages to crawl. The list order is significant: a URL's position
    # decides which shard processes it.
    urls = [
        'https://www.motion-lab.ch/',
        'https://www.adus-klinik.ch/',
        'https://www.medidranse.ch/',
        'https://www.cliniclesalpes.com/',
        'https://www.spitalmaennedorf.ch/notfall/',
        'https://www.polepositif.ch',
        'https://www.tellklinik.ch',
        'https://www.swissmedical.net/fr/swiss-visio/centres/swiss-visio-palezieux',
        'https://kinderklinik.insel.ch/de/',
        'https://www.cmsatigny.ch/',
        'https://www.spitalthun.ch/notfallzentrum',
        'https://www.magellan.ch/centre/centre-imagerie-servette/',
        'https://osteo7-7.ch',
        'https://osteo7-7.ch',
        'https://hochgebirgsklinik.ch/',
        'https://www.sro.ch/',
        'https://www.upk.ch/startseite.html',
        'https://www.eoc.ch/Ospedali-e-Istituti/Ospedale-Regionale-di-Lugano/Civico-e-Italiano/Presentazione.html',
        'https://www.gzo.ch/',
        'https://www.rhne.ch',
        'https://www.eoc.ch/Ospedali-e-Istituti/Ospedale-Regionale-di-Lugano/Civico-e-Italiano/Presentazione.html',
        'https://www.spital-emmental.ch',
        'https://www.genolier.net/',
        'https://www.pdgr.ch/standorte/klinik-waldhaus-chur/',
        'https://www.ilavigny.ch/',
        'https://www.luks.ch',
        'https://www.swsieber.ch/',
        'https://www.lindenhofgruppe.ch/de/standorte/sonnenhof/',
        'https://www.hirslanden.ch/de/klinik-beau-site/home.html',
        'https://www.lasource.ch/',
        'https://www.ksa.ch/',
        'https://www.hirslanden.ch/',
        'https://www.hug-ge.ch/',
        'https://www.obach.ch/',
        'https://www.lindenhofgruppe.ch/de/ueber-uns/standorte/',
        'https://www.ehnv.ch/etablissements/hopital-dyverdon-les-bains',
        'https://www.hug-ge.ch/hopital-loex',
        'https://www.lindenhofgruppe.ch/de/standorte/lindenhof/',
        'https://www.ehc-vd.ch/hopital-morges',
        'https://www.kssg.ch/',
        'https://www.herzzentrum.ch/',
        'https://www.h-fr.ch/hfr/fr/pub/index.htm',
        'https://www.hug.ch/lhopital-de-beau-sejour',
        'https://www.hug-ge.ch/hopital-bellerive/',
        'https://www.spital-oberengadin.ch/',
        'https://nant.ch/',
        'https://beritklinik.ch/berit-klinik-wattwil-landingpage/',
        'https://www.ksw.ch/',
        'https://www.lindberg.ch/',
        'https://www.barmelweid.ch/',
        'https://www.h-och.ch/ueber-uns/standorte/rorschach/',
        'https://www.stgag.ch/fachbereiche/psychiatrische-klinik/psychiatrische-klinik-muensterlingen/',
        'https://www.felixplatter.ch/',
        'https://www.hopitalrivierachablais.ch',
        'https://www.psychiatrie-sg.ch/',
        'https://www.hohenegg.ch/',
        'https://www.spitalmaennedorf.ch/',
        'https://irides.ch/',
        'https://www.hirslanden.ch/de/salem-spital/home.html',
        'https://www.hug-ge.ch/hopital-psychiatrie',
        'http://www.ksbl.ch/das-ksbl/standorte/der-standort-liestal',
        'https://www.spital-emmental.ch/',
        'https://www.unispital-basel.ch/ueber-uns/bereiche/medizinische-querschnittsfunktionen/kliniken-institute-abteilungen/institut-fuer-medizinische-genetik-und-pathologie/pathologie/',
        'https://www.hopitalvs.ch/de/spital-wallis/standorte/spitalzentrum-oberwallis.html',
        'https://www.pukzh.ch/standorte/?locationId=07956ecd-155d-0010-0152-012983939def',
        'https://www.ghol.ch',
        'https://www.eoc.ch/',
        'https://www.ehc-vd.ch/centremedical-aubonne',
        'https://www.csvp.ch',
        'https://oberwaid.ch/',
        'https://www.see-spital.ch/',
        'https://www.insel.ch/',
        'https://www.hopitalduvalais.ch/fr/lhopital-du-valais/sites/sion.html',
        'https://www.spital-lachen.ch/',
        'https://www.hug-ge.ch/crans-montana',
        # Scheme added: without it requests rejects the URL and the site
        # is never crawled.
        'https://www.spital-muri.ch',
        'https://www.ehc-vd.ch/hopital-gilly/',
        'https://spitalthun.ch/',
        'https://www.hopital-broye.ch/',
        'https://www.la-tour.ch/',
        'https://www.spitaluster.ch',
        'http://www.triemli.ch/',
        'https://www.klinikbethanien.ch/',
        'http://www.waidspital.ch/',
        'https://www.spital-schwyz.ch/startseite.html',
        'https://www.spital-einsiedeln.ch',
        'https://www.usz.ch/',
        'https://spitalstsag.ch/',
        'https://www.ksgr.ch/frauenklinik',
        'https://www.ksgr.ch/',
        'https://www.lups.ch/standorte/',
        'https://www.kispi.uzh.ch/rza/de/Seiten/default.aspx',
        'https://www.ksgr.ch/',
        'https://www.spital-limmattal.ch/',
        'https://www.stgag.ch/',
        'https://www.kantonsspitalbaden.ch/',
        'https://adressverzeichnis.sozialearbeit.zhaw.ch/Detail/Index/Clienia_Schloessli_AG_-_Psychiatriezentrum_Uster-Alterstagesklinik-Uster-2ba9eb04-bc44-e311-8b4f-005056a606f6',
        'https://www.gzdielsdorf.ch/',
        'https://www.spital-linth.ch/',
        'https://www.stgag.ch/patienten-besucher/kantonsspital-muensterlingen/besuchszeiten/',
        'https://www.ghol.ch/jcms/ghol_5306/fr/votre-admission?portal=ghol_5842',
        'https://www.vidymed.ch/centre-medical-de-vidy.html',
        'https://www.hirslanden.ch/fr/clinique-bois-cerf/home.html',
        'https://www.hirslanden.ch/fr/clinique-cecil/home.html',
        'https://www.ehnv.ch/etablissements/hopital-de-la-vallee-de-joux',
        'https://gmo.ch/',
        'https://www.spitalzollikerberg.ch/',
        'https://www.hopitalduvalais.ch/fr/lhopital-du-valais/sites/martigny.html',
        'https://www.hopitalduvalais.ch/fr/lhopital-du-valais/sites/clinique-saint-ame.html?ct=0&cHash=58be177c5705953f44610229d3e65550',
        'https://www.hopitalrivierachablais.ch/jcms/c_5020/fr/monthey',
        'https://www.hopitalduvalais.ch/de/spital-wallis/standorte/visp.html',
        'https://www.hopitalduvalais.ch/fr/lhopital-du-valais/sites/sierre.html',
        'https://www.hopital-pae.ch/jcms/pae_10300/en/bienvenue',
        'https://www.hirslanden.ch/de/klinik-permanence/home.html',
        'https://www.unispital-basel.ch/',
        'https://www.rehab.ch/home.html',
        'https://www.kispisg.ch/de',
        'https://geriatrie-sg.ch/',
        'https://www.spitalbelp.ch/de/',
        'https://bethesda-spital.ch/',
        'https://www.hug-ge.ch/joli-mont',
        'https://www.h-fr.ch/',
        'https://www.daler.ch/',
        'https://www.ksbl.ch/das-ksbl/standorte/der-standort-bruderholz',
        'http://www.ksnw.ch',
        'https://www.srft.ch/',
        'https://www.reha-rheinfelden.ch/',
        'https://vertpre.com/fr/',
        'https://www.hug-ge.ch/psychiatrie-enfant-adolescent/unite-hospitalisation-jour',
        'https://www.claraspital.ch/',
        'https://merianiselin.ch/klinik/',
        'https://www.adullam.ch/',
        'https://srrws.ch/',
        'https://www.hirslanden.ch/global/de/startseite/kliniken_zentren/klinik_am_rosenberg.html',
        'https://www.luks.ch/',
        'https://www.luks.ch/standorte/standort-sursee',
        'https://www.klinik-adelheid.ch/',
        'https://www.klinik-arlesheim.ch/',
        'https://www.gzf.ch/startseite.html',
        'https://www.h-fr.ch/hfr/de/pub/patienten/standorte/meyriez.htm',
        'https://www.cliniquegenerale.ch/de/',
        'https://www.eoc.ch/Ospedali-e-Istituti/Ospedale-Regionale-di-Bellinzona-e-Valli/Bellinzona/Presentazione.html',
        'https://www.pdag.ch/',
        'https://www.ksgl.ch',
        'https://www.ksuri.ch/',
        'https://www.santacroce.ch/index.php?view=1000',
        'https://moncucco.ch/',
        'https://www.clinicasantanna.ch/it/',
        'https://www.csbregaglia.ch/',
        'https://www.spitalthusis.ch/',
        'https://www.csvm.ch/',
        'https://www.spitaeler-sh.ch/Patienten-Besucher/Psychiatriezentrum/index.php',
        'https://www.rehaclinic.ch',
        'http://www.spitalleuggern.ch/',
        'https://www.h-ju.ch/fr/L-Hopital-du-jura/Les-4-sites-de-l-hopital/Saignelegier/H-JU-site-de-Saignelegier.html',
        'https://www.h-ne.ch/contact/le-locle',
        'https://www.h-ne.ch/contact/val-de-ruz',
        'https://www.hopital-providence.ch/',
        'https://www.h-fr.ch/nos-sites-hospitaliers/hfr-tafers',
        'https://www.spitalriggisberg.ch/de/',
        'https://www.hirslanden.ch/de/klinik-linde/home.html',
        'https://www.klinikhohmad.ch',
        'https://www.rehaseewis.ch/home.html',
        'https://www.srrws.ch/ueber-uns/organisation/spitaeler/spital-altstaetten.html',
        'https://www.spitalverbund.ch/angebote-heiden-herisau/home-spital-heiden/',
        'https://www.ehnv.ch/jcms/obr_5017/orbe',
        'https://www.ehnv.ch/etablissements/hopital-de-chamblon',
        'https://www.hopitalduvalais.ch/de/spital-wallis/standorte/malevoz.html',
        'https://www.h-fr.ch/hfr/de/pub/dashfr/standorte/billens.htm',
        'http://www.clinique-le-noirmont.ch/',
        'https://www.h-ju.ch/',
        'https://www.h-ju.ch/',
        'https://www.rehabern.ch/',
        'https://www.eoc.ch/Ospedali-e-Istituti/Ospedale-Regionale-di-Locarno/Presentazione.html',
        'https://www.clinicasantachiara.ch/',
        'https://www.arsmedica.ch/it/',
        'https://www.spital-savognin.ch/',
        'https://cseb.ch/',
        'https://www.flurystiftung.ch/Startseite.20.0.html',
        'https://www.stephanshorn.ch/',
        'https://www.clienia.ch/de/standorte/standorte-stationaer/thurgau/littenheid',
        'https://seeklinik-brunnen.ch/',
        'https://www.paraplegie.ch/spz/de',
        'https://www.spitalmenziken.ch/',
        'https://www.gzf.ch/',
        'https://www.hirslanden.ch/de/klinik-belair/home.html',
        'https://klinik-schloss-mammern.ch/de/',
        'https://www.upd.ch/',
        'https://www.solothurnerspitaeler.ch/unsere-spitaeler/buergerspital-solothurn/',
        'https://www.klinik-gais.ch/home/',
        'https://www.usz.ch/standorte/usz-flughafen/',
        'https://www.herz-zentrum.com/muensterlingen/klinik/standorte',
        'https://www.ukbb.ch/',
        'https://www.hopital-broye.ch',
        'https://www.luks.ch/standorte/standort-luzern/kinderspital',
        'https://www.solothurnerspitaeler.ch/unsere-spitaeler/spital-dornach/',
        'https://www.zgks.ch/',
        'https://www.lups.ch/erwachsenen-psychiatrie/allgemeinpsychiatrie/kliniken/',
        'https://www.lups.ch',
        'https://www.spitalaffoltern.ch',
        'https://www.chuv.ch/',
        'https://www.spitalbuelach.ch/',
        'https://www.spitalfmi.ch/index.php?id=2217',
        'https://www.spitaeler-sh.ch/Patienten-Besucher/Kantonsspital-SH/index.php',
        'https://www.andreasklinik.ch/',
        'https://www.siloah.ch',
        'http://www.st-anna.ch',
        'https://www.spitalzentrum-biel.ch',
        'https://www.clienia.ch/',
        'https://www.kispi.uzh.ch/',
        'https://www.solothurnerspitaeler.ch/unsere-spitaeler/kantonsspital-olten/',
        'https://www.ksow.ch/',
    ]

    import sys
    from pathlib import Path

    from tqdm import tqdm

    # Shard selection: this process handles the URLs whose list index i
    # satisfies i % total == count. With no arguments everything is handled.
    count, total = parse_shard_args(sys.argv)

    output_dir = Path("/capstor/scratch/cscs/dshah/AI/HospitalData")
    output_dir.mkdir(parents=True, exist_ok=True)

    seen = set()
    # Wrapping the list (not the enumerate object) lets tqdm show the total.
    for i, url in enumerate(tqdm(urls)):
        # Skip exact repeats before the shard test, so every shard makes the
        # same decision and list indices (shard assignment) stay unchanged.
        if url in seen:
            continue
        seen.add(url)
        if i % total != count:
            continue

        print(f"Processing {i+1}/{len(urls)}: {url}")
        time.sleep(2)  # Be polite and avoid overwhelming the server
        documents = crawl_website(url, max_depth=1)

        # The raw URL contains '/' and ':' and cannot be used as a file name.
        out_path = output_dir / url_to_filename(url)
        with open(out_path, 'w', encoding='utf-8') as f:
            f.writelines(doc + "\n//\n" for doc in documents)
        print(f"Saved {len(documents)} documents from {url}")
        time.sleep(2)  # Be polite and avoid overwhelming the server