Files changed (3) hide show
  1. clean.py +52 -0
  2. crawl.py +73 -0
  3. summarize.py +105 -0
clean.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from tqdm import tqdm
3
+ import json
4
# Directory the crawler writes its raw per-site dumps into.
RAW_DIR = '/capstor/scratch/cscs/dshah/AI/HospitalData'
# Directory the cleaned JSON files are written to.
CLEANED_DIR = '/capstor/scratch/cscs/dshah/AI/CleanedHospitalData'
# Pages whose stripped text is shorter than this are discarded.
MIN_TEXT_LENGTH = 100


def parse_crawl_dump(raw):
    """Parse one raw crawl dump into a list of record dicts.

    The crawler stores its results with ``str(results)``, i.e. the Python
    ``repr`` of a list of ``{'url': ..., 'text': ...}`` dicts, so the dump is
    parsed as a Python literal rather than split on punctuation (page text
    may itself contain quotes, braces and commas).

    Args:
        raw: Full contents of a dump file.

    Returns:
        The list of dict records; empty when the dump is blank or ``[]``.

    Raises:
        ValueError: If the dump is not a literal list.
    """
    import ast  # local import: only this helper needs it

    raw = raw.strip()
    if not raw:
        return []
    try:
        records = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as exc:
        raise ValueError(f"not a valid crawl dump: {exc}") from exc
    if not isinstance(records, list):
        raise ValueError("crawl dump is not a list")
    return [record for record in records if isinstance(record, dict)]


def base_host(url):
    """Return the host used to decide whether a page belongs to the site.

    With a scheme, this is everything between ``://`` and the next ``/``.
    Without a scheme, a leading ``www.`` prefix (if any) is dropped first.
    """
    if "://" in url:
        remainder = url.split("://", 1)[1]
    else:
        # [-1] keeps the whole string when there is no "www." to split on.
        remainder = url.split("www.", 1)[-1]
    return remainder.split("/", 1)[0]


def clean_records(records):
    """Filter crawl records down to usable pages of the crawled site.

    The host of the first record defines the site. A record is dropped when
    its URL does not contain that host, when its text is missing or empty,
    when the text contains ``Error`` or ``Not Found``, or when the text is
    shorter than ``MIN_TEXT_LENGTH`` characters.

    Args:
        records: Iterable of dicts with ``'url'`` and ``'text'`` keys.

    Returns:
        A list of ``{'url': ..., 'text': ...}`` dicts with stripped values.
    """
    cleaned = []
    host = None
    for record in records:
        url = str(record.get('url', '')).strip()
        if host is None:
            # The first record is the start URL of the crawl.
            print(url)
            host = base_host(url)
        text = record.get('text')
        if not isinstance(text, str):
            print(record)
            print("Record has no text\n")
            continue
        text = text.strip()
        if host not in url:
            print("Different base URL\n")
            continue
        if text == '':
            print("Text is empty\n")
            continue
        if 'Error' in text or 'Not Found' in text:
            print("Error or Not Found in text\n")
            continue
        if len(text) < MIN_TEXT_LENGTH:
            print("Text is too short\n")
            continue
        cleaned.append({'url': url, 'text': text})
    return cleaned


def clean_file(path, out_dir=CLEANED_DIR):
    """Clean one raw dump and write ``<out_dir>/<basename>.json``.

    Args:
        path: Path of the raw dump file.
        out_dir: Directory that receives the cleaned JSON file.

    Returns:
        The path written, or ``None`` when the dump could not be read or
        parsed, or when no record survived the filtering.
    """
    try:
        with open(path, 'r', encoding='utf-8') as src:
            records = parse_crawl_dump(src.read())
    except (OSError, ValueError) as exc:
        # ValueError also covers UnicodeDecodeError from a non-UTF-8 file.
        print(f"Skipping {path}: {exc}")
        return None
    cleaned = clean_records(records)
    if not cleaned:
        return None
    out_path = os.path.join(out_dir, os.path.basename(path) + '.json')
    with open(out_path, 'w') as dst:
        json.dump(cleaned, dst, indent=4)
    return out_path


def main():
    """Clean every dump found in ``RAW_DIR`` into ``CLEANED_DIR``."""
    os.makedirs(CLEANED_DIR, exist_ok=True)
    paths = [os.path.join(RAW_DIR, name) for name in os.listdir(RAW_DIR)]
    for path in tqdm(paths):
        clean_file(path)


if __name__ == "__main__":
    main()
crawl.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from bs4 import BeautifulSoup
3
+ from urllib.parse import urljoin
4
+ import collections
5
+ import os
6
+ import requests
7
+ from multiprocessing import Pool
8
+ # ----------- FETCH FUNCTION -----------
9
def fetch(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block its worker
            forever.

    Returns:
        The response body, or ``None`` when the request fails or the server
        answers with an error status. ``None`` (rather than an error string)
        lets the caller skip the page instead of parsing the error message
        as if it were HTML.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error if the request failed
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    return response.text
17
+
18
+ # ----------- BFS CRAWLER FOR A SINGLE DOMAIN -----------
19
# Directory that receives one dump file per start URL.
HOSPITAL_DATA_DIR = '/capstor/scratch/cscs/dshah/AI/HospitalData'


def output_filename(start_url):
    """Return the dump file name for ``start_url``.

    The scheme prefix is removed and every ``/`` becomes ``_``. The same name
    is used both to detect an earlier crawl and to save a new one.
    """
    return start_url.replace("https://", "").replace("http://", "").replace("/", "_")


def crawl_bfs_single(start_url, max_depth=1, output_dir=HOSPITAL_DATA_DIR):
    """Breadth-first crawl from ``start_url`` and save the page texts.

    Pages are fetched with ``fetch``; every successfully fetched page
    contributes one ``{"url": ..., "text": ...}`` record. Links are followed
    while the current depth is below ``max_depth``.

    Args:
        start_url: Page the crawl starts from (depth 0).
        max_depth: Largest link depth that is still fetched.
        output_dir: Directory the dump file is written to.

    Returns:
        The list of records, or ``None`` when a dump for ``start_url``
        already exists and the crawl was skipped.
    """
    from urllib.parse import urldefrag  # local import: only needed here

    out_path = os.path.join(output_dir, output_filename(start_url))
    if os.path.exists(out_path):
        print(f"ℹ️ Skipping {start_url}, already crawled.")
        return None  # Stop processing if already crawled

    visited = set()
    queue = collections.deque([(start_url, 0)])
    results = []

    while queue:
        url, depth = queue.popleft()

        if url in visited or depth > max_depth:
            continue
        visited.add(url)

        print(f"🌐 Crawling: {url} (depth={depth})")
        html = fetch(url)
        if html is None:
            continue

        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text(separator="\n", strip=True)
        results.append({"url": url, "text": text})

        if depth < max_depth:
            for a in soup.find_all("a", href=True):
                # The fragment is never sent to the server, so "page#a" and
                # "page#b" are the same document; drop it to avoid refetching.
                full_url = urldefrag(urljoin(url, a["href"]))[0]
                if full_url.startswith("http") and full_url not in visited:
                    queue.append((full_url, depth + 1))

    # The dump is the repr of the record list (not JSON); keep this format
    # in sync with whatever reads the dumps back.
    print(f"W {out_path}")
    try:
        os.makedirs(output_dir, exist_ok=True)
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write(str(results))
    except (OSError, UnicodeError) as e:
        # Best effort: a failed save must not take down the worker pool.
        print(f"❌ Error saving results: {e}")
    return results
62
+
63
+
64
+
65
+ # ----------- RUN SCRIPT -----------
66
+ urls = ['https://www.motion-lab.ch/' ,'https://www.adus-klinik.ch/' ,'https://www.medidranse.ch/' ,'https://www.cliniclesalpes.com/' ,'https://www.spitalmaennedorf.ch/notfall/' ,'https://www.polepositif.ch' ,'https://www.tellklinik.ch' ,'https://www.swissmedical.net/fr/swiss-visio/centres/swiss-visio-palezieux' ,'https://kinderklinik.insel.ch/de/' ,'https://www.cmsatigny.ch/' ,'https://www.spitalthun.ch/notfallzentrum' ,'https://www.magellan.ch/centre/centre-imagerie-servette/' ,'https://osteo7-7.ch' ,'https://osteo7-7.ch' ,'https://hochgebirgsklinik.ch/' ,'https://www.sro.ch/' ,'https://www.upk.ch/startseite.html' ,'https://www.eoc.ch/Ospedali-e-Istituti/Ospedale-Regionale-di-Lugano/Civico-e-Italiano/Presentazione.html' ,'https://www.gzo.ch/' ,'https://www.rhne.ch' ,'https://www.eoc.ch/Ospedali-e-Istituti/Ospedale-Regionale-di-Lugano/Civico-e-Italiano/Presentazione.html' ,'https://www.spital-emmental.ch' ,'https://www.genolier.net/' ,'https://www.pdgr.ch/standorte/klinik-waldhaus-chur/' ,'https://www.ilavigny.ch/' ,'https://www.luks.ch' ,'https://www.swsieber.ch/' ,'https://www.lindenhofgruppe.ch/de/standorte/sonnenhof/' ,'https://www.hirslanden.ch/de/klinik-beau-site/home.html' ,'https://www.lasource.ch/' ,'https://www.ksa.ch/' ,'https://www.hirslanden.ch/' ,'https://www.hug-ge.ch/' ,'https://www.obach.ch/' ,'https://www.lindenhofgruppe.ch/de/ueber-uns/standorte/' ,'https://www.ehnv.ch/etablissements/hopital-dyverdon-les-bains' ,'https://www.hug-ge.ch/hopital-loex' ,'https://www.lindenhofgruppe.ch/de/standorte/lindenhof/' ,'https://www.ehc-vd.ch/hopital-morges' ,'https://www.kssg.ch/' ,'https://www.herzzentrum.ch/' ,'https://www.h-fr.ch/hfr/fr/pub/index.htm' ,'https://www.hug.ch/lhopital-de-beau-sejour' ,'https://www.hug-ge.ch/hopital-bellerive/' ,'https://www.spital-oberengadin.ch/' ,'https://nant.ch/' ,'https://beritklinik.ch/berit-klinik-wattwil-landingpage/' ,'https://www.ksw.ch/' ,'https://www.lindberg.ch/' ,'https://www.barmelweid.ch/' 
,'https://www.h-och.ch/ueber-uns/standorte/rorschach/' ,'https://www.stgag.ch/fachbereiche/psychiatrische-klinik/psychiatrische-klinik-muensterlingen/' ,'https://www.felixplatter.ch/' ,'https://www.hopitalrivierachablais.ch' ,'https://www.psychiatrie-sg.ch/' ,'https://www.hohenegg.ch/' ,'https://www.spitalmaennedorf.ch/' ,'https://irides.ch/' ,'https://www.hirslanden.ch/de/salem-spital/home.html' ,'https://www.hug-ge.ch/hopital-psychiatrie' ,'http://www.ksbl.ch/das-ksbl/standorte/der-standort-liestal' ,'https://www.spital-emmental.ch/' ,'https://www.unispital-basel.ch/ueber-uns/bereiche/medizinische-querschnittsfunktionen/kliniken-institute-abteilungen/institut-fuer-medizinische-genetik-und-pathologie/pathologie/' ,'https://www.hopitalvs.ch/de/spital-wallis/standorte/spitalzentrum-oberwallis.html' ,'https://www.pukzh.ch/standorte/?locationId=07956ecd-155d-0010-0152-012983939def' ,'https://www.ghol.ch' ,'https://www.eoc.ch/' ,'https://www.ehc-vd.ch/centremedical-aubonne' ,'https://www.csvp.ch' ,'https://oberwaid.ch/' ,'https://www.see-spital.ch/' ,'https://www.insel.ch/' ,'https://www.hopitalduvalais.ch/fr/lhopital-du-valais/sites/sion.html' ,'https://www.spital-lachen.ch/' ,'https://www.hug-ge.ch/crans-montana' ,'www.spital-muri.ch' ,'https://www.ehc-vd.ch/hopital-gilly/' ,'https://spitalthun.ch/' ,'https://www.hopital-broye.ch/' ,'https://www.la-tour.ch/' ,'https://www.spitaluster.ch' ,'http://www.triemli.ch/' ,'https://www.klinikbethanien.ch/' ,'http://www.waidspital.ch/' ,'https://www.spital-schwyz.ch/startseite.html' ,'https://www.spital-einsiedeln.ch' ,'https://www.usz.ch/' ,'https://spitalstsag.ch/' ,'https://www.ksgr.ch/frauenklinik' ,'https://www.ksgr.ch/' ,'https://www.lups.ch/standorte/' ,'https://www.kispi.uzh.ch/rza/de/Seiten/default.aspx' ,'https://www.ksgr.ch/' ,'https://www.spital-limmattal.ch/' ,'https://www.stgag.ch/' ,'https://www.kantonsspitalbaden.ch/' 
,'https://adressverzeichnis.sozialearbeit.zhaw.ch/Detail/Index/Clienia_Schloessli_AG_-_Psychiatriezentrum_Uster-Alterstagesklinik-Uster-2ba9eb04-bc44-e311-8b4f-005056a606f6' ,'https://www.gzdielsdorf.ch/' ,'https://www.spital-linth.ch/' ,'https://www.stgag.ch/patienten-besucher/kantonsspital-muensterlingen/besuchszeiten/' ,'https://www.ghol.ch/jcms/ghol_5306/fr/votre-admission?portal=ghol_5842' ,'https://www.vidymed.ch/centre-medical-de-vidy.html' ,'https://www.hirslanden.ch/fr/clinique-bois-cerf/home.html' ,'https://www.hirslanden.ch/fr/clinique-cecil/home.html' ,'https://www.ehnv.ch/etablissements/hopital-de-la-vallee-de-joux' ,'https://gmo.ch/' ,'https://www.spitalzollikerberg.ch/' ,'https://www.hopitalduvalais.ch/fr/lhopital-du-valais/sites/martigny.html' ,'https://www.hopitalduvalais.ch/fr/lhopital-du-valais/sites/clinique-saint-ame.html?ct=0&cHash=58be177c5705953f44610229d3e65550' ,'https://www.hopitalrivierachablais.ch/jcms/c_5020/fr/monthey' ,'https://www.hopitalduvalais.ch/de/spital-wallis/standorte/visp.html' ,'https://www.hopitalduvalais.ch/fr/lhopital-du-valais/sites/sierre.html' ,'https://www.hopital-pae.ch/jcms/pae_10300/en/bienvenue' ,'https://www.hirslanden.ch/de/klinik-permanence/home.html' ,'https://www.unispital-basel.ch/' ,'https://www.rehab.ch/home.html' ,'https://www.kispisg.ch/de' ,'https://geriatrie-sg.ch/' ,'https://www.spitalbelp.ch/de/' ,'https://bethesda-spital.ch/' ,'https://www.hug-ge.ch/joli-mont' ,'https://www.h-fr.ch/' ,'https://www.daler.ch/' ,'https://www.ksbl.ch/das-ksbl/standorte/der-standort-bruderholz' ,'http://www.ksnw.ch' ,'https://www.srft.ch/' ,'https://www.reha-rheinfelden.ch/' ,'https://vertpre.com/fr/' ,'https://www.hug-ge.ch/psychiatrie-enfant-adolescent/unite-hospitalisation-jour' ,'https://www.claraspital.ch/' ,'https://merianiselin.ch/klinik/' ,'https://www.adullam.ch/' ,'https://srrws.ch/' ,'https://www.hirslanden.ch/global/de/startseite/kliniken_zentren/klinik_am_rosenberg.html' ,'https://www.luks.ch/' 
,'https://www.luks.ch/standorte/standort-sursee' ,'https://www.klinik-adelheid.ch/' ,'https://www.klinik-arlesheim.ch/' ,'https://www.gzf.ch/startseite.html' ,'https://www.h-fr.ch/hfr/de/pub/patienten/standorte/meyriez.htm' ,'https://www.cliniquegenerale.ch/de/' ,'https://www.eoc.ch/Ospedali-e-Istituti/Ospedale-Regionale-di-Bellinzona-e-Valli/Bellinzona/Presentazione.html' ,'https://www.pdag.ch/' ,'https://www.ksgl.ch' ,'https://www.ksuri.ch/' ,'https://www.santacroce.ch/index.php?view=1000' ,'https://moncucco.ch/' ,'https://www.clinicasantanna.ch/it/' ,'https://www.csbregaglia.ch/' ,'https://www.spitalthusis.ch/' ,'https://www.csvm.ch/' ,'https://www.spitaeler-sh.ch/Patienten-Besucher/Psychiatriezentrum/index.php' ,'https://www.rehaclinic.ch' ,'http://www.spitalleuggern.ch/' ,'https://www.h-ju.ch/fr/L-Hopital-du-jura/Les-4-sites-de-l-hopital/Saignelegier/H-JU-site-de-Saignelegier.html' ,'https://www.h-ne.ch/contact/le-locle' ,'https://www.h-ne.ch/contact/val-de-ruz' ,'https://www.hopital-providence.ch/' ,'https://www.h-fr.ch/nos-sites-hospitaliers/hfr-tafers' ,'https://www.spitalriggisberg.ch/de/' ,'https://www.hirslanden.ch/de/klinik-linde/home.html' ,'https://www.klinikhohmad.ch' ,'https://www.rehaseewis.ch/home.html' ,'https://www.srrws.ch/ueber-uns/organisation/spitaeler/spital-altstaetten.html' ,'https://www.spitalverbund.ch/angebote-heiden-herisau/home-spital-heiden/' ,'https://www.ehnv.ch/jcms/obr_5017/orbe' ,'https://www.ehnv.ch/etablissements/hopital-de-chamblon' ,'https://www.hopitalduvalais.ch/de/spital-wallis/standorte/malevoz.html' ,'https://www.h-fr.ch/hfr/de/pub/dashfr/standorte/billens.htm' ,'http://www.clinique-le-noirmont.ch/' ,'https://www.h-ju.ch/' ,'https://www.h-ju.ch/' ,'https://www.rehabern.ch/' ,'https://www.eoc.ch/Ospedali-e-Istituti/Ospedale-Regionale-di-Locarno/Presentazione.html' ,'https://www.clinicasantachiara.ch/' ,'https://www.arsmedica.ch/it/' ,'https://www.spital-savognin.ch/' ,'https://cseb.ch/' 
,'https://www.flurystiftung.ch/Startseite.20.0.html' ,'https://www.stephanshorn.ch/' ,'https://www.clienia.ch/de/standorte/standorte-stationaer/thurgau/littenheid' ,'https://seeklinik-brunnen.ch/' ,'https://www.paraplegie.ch/spz/de' ,'https://www.spitalmenziken.ch/' ,'https://www.gzf.ch/' ,'https://www.hirslanden.ch/de/klinik-belair/home.html' ,'https://klinik-schloss-mammern.ch/de/' ,'https://www.upd.ch/' ,'https://www.solothurnerspitaeler.ch/unsere-spitaeler/buergerspital-solothurn/' ,'https://www.klinik-gais.ch/home/' ,'https://www.usz.ch/standorte/usz-flughafen/' ,'https://www.herz-zentrum.com/muensterlingen/klinik/standorte' ,'https://www.ukbb.ch/' ,'https://www.hopital-broye.ch' ,'https://www.luks.ch/standorte/standort-luzern/kinderspital' ,'https://www.solothurnerspitaeler.ch/unsere-spitaeler/spital-dornach/' ,'https://www.zgks.ch/' ,'https://www.lups.ch/erwachsenen-psychiatrie/allgemeinpsychiatrie/kliniken/' ,'https://www.lups.ch' ,'https://www.spitalaffoltern.ch' ,'https://www.chuv.ch/' ,'https://www.spitalbuelach.ch/' ,'https://www.spitalfmi.ch/index.php?id=2217' ,'https://www.spitaeler-sh.ch/Patienten-Besucher/Kantonsspital-SH/index.php' ,'https://www.andreasklinik.ch/' ,'https://www.siloah.ch' ,'http://www.st-anna.ch' ,'https://www.spitalzentrum-biel.ch' ,'https://www.clienia.ch/' ,'https://www.kispi.uzh.ch/' ,'https://www.solothurnerspitaeler.ch/unsere-spitaeler/kantonsspital-olten/' ,'https://www.ksow.ch/']
67
def normalize_start_urls(candidates):
    """Return the start URLs ready for crawling.

    Blank entries are dropped, entries without a scheme get ``https://``
    prepended (``requests`` rejects scheme-less URLs), and exact duplicates
    are removed while keeping first-seen order so that two workers never
    crawl the same start URL and race on the same output file.
    """
    normalized = []
    for url in candidates:
        url = url.strip()
        if not url:
            continue
        if "://" not in url:
            url = "https://" + url
        normalized.append(url)
    # dict preserves insertion order, so this de-duplicates stably.
    return list(dict.fromkeys(normalized))


if __name__ == "__main__":
    start_urls = normalize_start_urls(urls)
    with Pool(processes=20) as pool:
        pool.map(crawl_bfs_single, start_urls)
    print("✅ Crawled all the pages")
72
+
73
+
summarize.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from urllib import response
3
+ import requests
4
+ from tqdm import tqdm
5
+ from openai import OpenAI
6
+
7
# Directory holding the cleaned per-hospital JSON files to summarize.
parent = '/capstor/scratch/cscs/dshah/AI/CleanedHospitalData'
# The OpenRouter API key is read from the OPENROUTER_API_KEY environment
# variable; credentials must never be committed to the repository. Any key
# that was previously committed here should be treated as leaked and revoked.
openrouterapi = os.environ.get('OPENROUTER_API_KEY', '')
10
+
11
# Instruction sent after the file contents in every summarization request.
SUMMARY_PROMPT = 'Summarize the above text with Hospital name as the heading and in key word format. Make it as precise as possible and do not assume anything apart from the text. Give me the keywords related to the technologies that are offered by the hospital, The specialties they have and the services they provide. Ignore writing everything else. Do not write any warnings or disclaimers, only useful text.'


def get_from_or(file_path):
    """Summarize one cleaned file through OpenRouter and save the summary.

    Args:
        file_path: File name relative to ``parent``; only names ending in
            ``.json`` are processed.

    Returns:
        The summary text on success. The string ``'error'`` when the name is
        not a ``.json`` file, when the summary file already exists, when the
        model returns no text, or when reading, requesting or writing fails.
    """
    if not file_path.endswith('.json'):
        return 'error'
    file_path = os.path.join(parent, file_path)
    output_file = file_path.replace('CleanedHospitalData', 'SummarizedCleanedHospitalData')
    if os.path.exists(output_file):
        # Already summarized in an earlier run.
        return 'error'

    # Everything that can fail sits inside the try: this function runs in a
    # worker pool, and an exception escaping here would abort the whole map.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=openrouterapi,
        )
        completion = client.chat.completions.create(
            model="x-ai/grok-4-fast:free",
            messages=[
                {
                    'role': 'user',
                    'content': [{'type': 'text', 'text': content}],
                },
                {
                    'role': 'user',
                    'content': [{'type': 'text', 'text': SUMMARY_PROMPT}],
                },
            ],
        )
        output = completion.choices[0].message.content
        if not output:
            # Do not write an empty file, so a later run retries this input.
            print(f"Error occurred: empty completion for {file_path}")
            return 'error'

        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as out_file:
            out_file.write(output)
        return output
    except Exception as e:
        print(f"Error occurred: {e}")
        return 'error'
56
+
57
+
58
# Request headers for the Swisscom-hosted endpoint. The bearer token is read
# from the SWISSCOM_API_TOKEN environment variable instead of being stored in
# the source; a token that was previously committed here should be revoked.
headers = {
    'Authorization': 'Bearer ' + os.environ.get('SWISSCOM_API_TOKEN', ''),
    'Content-Type': 'application/json',
}
62
def get_json_for_file(file_path):
    """Build the chat-completion request payload for one file.

    Args:
        file_path: Path of the text file whose contents are to be summarized.

    Returns:
        A dict with the ``'model'`` name and two user ``'messages'``: the
        file contents followed by the summarization instruction.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    instruction = 'Summarize the above text in key word format. Make it as precise as possible and do not assume anything apart from the text. Give me the keywords related to the technologies that are offered by the hospital, The specialties they have and the services they provide. Ignore writing everything else. Do not write any warnings or disclaimers, only useful text.'
    return {
        'model': 'swiss-ai/Apertus-70B',
        'messages': [
            {'role': 'user', 'content': content},
            {'role': 'user', 'content': instruction},
        ],
    }
75
+ # json_data = {
76
+ # 'model': 'swiss-ai/Apertus-70B',
77
+ # 'messages': [
78
+ # {
79
+ # 'role': 'user',
80
+ # 'content': 'Hello, how are you?',
81
+ # },
82
+ # ],
83
+ # }
84
+ # for file in tqdm(os.listdir(parent)):
85
import multiprocessing as mp


def select_inputs(names):
    """Return the ``.json`` entries of ``names`` in sorted order.

    Other entries are left out up front instead of being dispatched to a
    worker that would only reject them.
    """
    return sorted(name for name in names if name.endswith('.json'))


if __name__ == '__main__':
    # List the directory only when run as a script, so importing this module
    # (which worker processes may do) has no filesystem side effects.
    files = select_inputs(os.listdir(parent))
    if files:
        # Never start more workers than there are files; Pool rejects 0.
        with mp.Pool(processes=min(100, len(files))) as pool:
            pool.map(get_from_or, files)
91
+
92
+
93
+ # json_data = get_json_for_file(file_path)
94
+ # response = requests.post(
95
+ # 'https://api.swisscom.com/layer/swiss-ai-weeks/apertus-70b/v1/chat/completions',
96
+ # headers=headers,
97
+ # json=json_data,
98
+ # )
99
+ # with open(output_file, 'w') as out_file:
100
+ # out_file.write(response.text)
101
+
102
+ # response_text = get_from_or(file_path)
103
+ # print(response.json())
104
+
105
+