Spaces:
Sleeping
Sleeping
Upload 3 files
#3
by
Muzansama - opened
- clean.py +52 -0
- crawl.py +73 -0
- summarize.py +105 -0
clean.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from tqdm import tqdm
|
| 3 |
+
import json
|
| 4 |
+
import ast  # parses the crawler's repr() dumps; imported here so this section is self-contained

# Directory written by the crawler (one dump file per start URL) and the
# directory that receives the filtered JSON files.
RAW_DIR = '/capstor/scratch/cscs/dshah/AI/HospitalData'
CLEAN_DIR = '/capstor/scratch/cscs/dshah/AI/CleanedHospitalData'
# Pages whose text is shorter than this are dropped as "too short".
MIN_TEXT_LENGTH = 100


def _site_host(url):
    """Return the host of *url*, used to decide whether a page is on the crawled site.

    With a scheme the host is returned as written (including any ``www.``);
    without a scheme a leading ``www.`` is dropped.
    """
    if "://" in url:
        return url.split("://", 1)[1].split("/")[0]
    host = url.split("/")[0]
    return host[4:] if host.startswith("www.") else host


def _load_records(raw):
    """Parse one crawl dump (the ``str()`` of a list of dicts) into a list.

    Returns ``None`` when the dump is not a valid Python literal list.
    """
    try:
        records = ast.literal_eval(raw)
    except (ValueError, SyntaxError) as exc:
        print(f"Could not parse crawl dump: {exc}\n")
        return None
    return records if isinstance(records, list) else None


def _clean_records(records):
    """Filter crawled pages, keeping only usable pages of the start site.

    The first record's URL defines the site host. A page is dropped when it is
    on another host, has empty or short text, or its text contains 'Error' or
    'Not Found'.
    """
    kept = []
    base_url = None
    for record in records:
        if not isinstance(record, dict) or 'url' not in record:
            print(record)
            print("Error in splitting text\n")
            continue
        url = str(record['url']).strip()
        if base_url is None:
            # The first entry of a dump is the start URL of the crawl.
            print(url)
            base_url = _site_host(url)
        if 'text' not in record:
            print(record)
            print("Error in splitting text\n")
            continue
        text = str(record['text']).strip()
        if base_url not in url:
            print("Different base URL\n")
            continue
        if text == '':
            print("Text is empty\n")
            continue
        if 'Error' in text or 'Not Found' in text:
            print("Error or Not Found in text\n")
            continue
        if len(text) < MIN_TEXT_LENGTH:
            print("Text is too short\n")
            continue
        kept.append({'url': url, 'text': text})
    return kept


files = [os.path.join(RAW_DIR, f) for f in os.listdir(RAW_DIR)]
os.makedirs(CLEAN_DIR, exist_ok=True)
for file in tqdm(files):
    if not os.path.isfile(file):
        continue
    with open(file, 'r') as f:
        lines = f.read()
    if lines.strip() == '[]':
        continue
    records = _load_records(lines)
    if not records:
        continue
    data_cleaned = _clean_records(records)
    if not data_cleaned:
        continue
    out_path = os.path.join(CLEAN_DIR, os.path.basename(file) + '.json')
    with open(out_path, 'w') as f:
        json.dump(data_cleaned, f, indent=4)
|
crawl.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
from urllib.parse import urljoin
|
| 4 |
+
import collections
|
| 5 |
+
import os
|
| 6 |
+
import requests
|
| 7 |
+
from multiprocessing import Pool
|
| 8 |
+
# ----------- FETCH FUNCTION -----------
|
| 9 |
+
def fetch(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a crawler worker indefinitely.

    Returns:
        The response text, or ``None`` when the request fails or the server
        answers with an HTTP error status. The failure is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error if the request failed
    except requests.exceptions.RequestException as e:
        # Return None rather than the message: the caller tests `html is None`
        # and would otherwise store the error message as page text.
        print(f"Error fetching URL: {e}")
        return None

    return response.text
|
| 17 |
+
|
| 18 |
+
# ----------- BFS CRAWLER FOR A SINGLE DOMAIN -----------
|
| 19 |
+
def crawl_bfs_single(start_url):
    """Crawl *start_url* breadth-first to depth 1 and save the page texts.

    The start page and every ``http`` link found on it are fetched; links on
    those linked pages are not followed. The result list is written as its
    ``str()`` representation to a file named after the start URL (scheme
    removed, ``/`` replaced by ``_``).

    Args:
        start_url: URL at which the crawl begins.

    Returns:
        A list of ``{"url": ..., "text": ...}`` dicts, or ``None`` when the
        output file already exists and the crawl is skipped.
    """
    out_dir = '/capstor/scratch/cscs/dshah/AI/HospitalData'
    # Computed once; it is both the "already crawled" marker and the save target.
    out_name = start_url.replace("https://", "").replace("http://", "").replace("/", "_")
    out_path = os.path.join(out_dir, out_name)

    if os.path.exists(out_path):
        print(f"ℹ️ Skipping {start_url}, already crawled.")
        return  # Stop processing if already crawled

    max_depth = 1
    visited = set()
    queue = collections.deque([(start_url, 0)])
    results = []

    while queue:
        url, depth = queue.popleft()

        # A URL may be queued more than once; it is only processed once.
        if url in visited or depth > max_depth:
            continue
        visited.add(url)

        print(f"🌐 Crawling: {url} (depth={depth})")
        html = fetch(url)
        if html is None:
            continue

        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text(separator="\n", strip=True)
        results.append({"url": url, "text": text})

        if depth < max_depth:
            for a in soup.find_all("a", href=True):
                full_url = urljoin(url, a["href"])
                if full_url.startswith("http") and full_url not in visited:
                    queue.append((full_url, depth + 1))

    # Save results to a file named after the start URL.
    print(f"W /capstor/scratch/cscs/dshah/AI/HospitalData/{out_name}")
    try:
        os.makedirs(out_dir, exist_ok=True)
        with open(out_path, 'w') as f:
            f.write(str(results))
    except (OSError, UnicodeError) as e:
        print(f"❌ Error saving results: {e}")
    return results
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# ----------- RUN SCRIPT -----------
|
| 66 |
+
urls = ['https://www.motion-lab.ch/' ,'https://www.adus-klinik.ch/' ,'https://www.medidranse.ch/' ,'https://www.cliniclesalpes.com/' ,'https://www.spitalmaennedorf.ch/notfall/' ,'https://www.polepositif.ch' ,'https://www.tellklinik.ch' ,'https://www.swissmedical.net/fr/swiss-visio/centres/swiss-visio-palezieux' ,'https://kinderklinik.insel.ch/de/' ,'https://www.cmsatigny.ch/' ,'https://www.spitalthun.ch/notfallzentrum' ,'https://www.magellan.ch/centre/centre-imagerie-servette/' ,'https://osteo7-7.ch' ,'https://osteo7-7.ch' ,'https://hochgebirgsklinik.ch/' ,'https://www.sro.ch/' ,'https://www.upk.ch/startseite.html' ,'https://www.eoc.ch/Ospedali-e-Istituti/Ospedale-Regionale-di-Lugano/Civico-e-Italiano/Presentazione.html' ,'https://www.gzo.ch/' ,'https://www.rhne.ch' ,'https://www.eoc.ch/Ospedali-e-Istituti/Ospedale-Regionale-di-Lugano/Civico-e-Italiano/Presentazione.html' ,'https://www.spital-emmental.ch' ,'https://www.genolier.net/' ,'https://www.pdgr.ch/standorte/klinik-waldhaus-chur/' ,'https://www.ilavigny.ch/' ,'https://www.luks.ch' ,'https://www.swsieber.ch/' ,'https://www.lindenhofgruppe.ch/de/standorte/sonnenhof/' ,'https://www.hirslanden.ch/de/klinik-beau-site/home.html' ,'https://www.lasource.ch/' ,'https://www.ksa.ch/' ,'https://www.hirslanden.ch/' ,'https://www.hug-ge.ch/' ,'https://www.obach.ch/' ,'https://www.lindenhofgruppe.ch/de/ueber-uns/standorte/' ,'https://www.ehnv.ch/etablissements/hopital-dyverdon-les-bains' ,'https://www.hug-ge.ch/hopital-loex' ,'https://www.lindenhofgruppe.ch/de/standorte/lindenhof/' ,'https://www.ehc-vd.ch/hopital-morges' ,'https://www.kssg.ch/' ,'https://www.herzzentrum.ch/' ,'https://www.h-fr.ch/hfr/fr/pub/index.htm' ,'https://www.hug.ch/lhopital-de-beau-sejour' ,'https://www.hug-ge.ch/hopital-bellerive/' ,'https://www.spital-oberengadin.ch/' ,'https://nant.ch/' ,'https://beritklinik.ch/berit-klinik-wattwil-landingpage/' ,'https://www.ksw.ch/' ,'https://www.lindberg.ch/' ,'https://www.barmelweid.ch/' 
,'https://www.h-och.ch/ueber-uns/standorte/rorschach/' ,'https://www.stgag.ch/fachbereiche/psychiatrische-klinik/psychiatrische-klinik-muensterlingen/' ,'https://www.felixplatter.ch/' ,'https://www.hopitalrivierachablais.ch' ,'https://www.psychiatrie-sg.ch/' ,'https://www.hohenegg.ch/' ,'https://www.spitalmaennedorf.ch/' ,'https://irides.ch/' ,'https://www.hirslanden.ch/de/salem-spital/home.html' ,'https://www.hug-ge.ch/hopital-psychiatrie' ,'http://www.ksbl.ch/das-ksbl/standorte/der-standort-liestal' ,'https://www.spital-emmental.ch/' ,'https://www.unispital-basel.ch/ueber-uns/bereiche/medizinische-querschnittsfunktionen/kliniken-institute-abteilungen/institut-fuer-medizinische-genetik-und-pathologie/pathologie/' ,'https://www.hopitalvs.ch/de/spital-wallis/standorte/spitalzentrum-oberwallis.html' ,'https://www.pukzh.ch/standorte/?locationId=07956ecd-155d-0010-0152-012983939def' ,'https://www.ghol.ch' ,'https://www.eoc.ch/' ,'https://www.ehc-vd.ch/centremedical-aubonne' ,'https://www.csvp.ch' ,'https://oberwaid.ch/' ,'https://www.see-spital.ch/' ,'https://www.insel.ch/' ,'https://www.hopitalduvalais.ch/fr/lhopital-du-valais/sites/sion.html' ,'https://www.spital-lachen.ch/' ,'https://www.hug-ge.ch/crans-montana' ,'www.spital-muri.ch' ,'https://www.ehc-vd.ch/hopital-gilly/' ,'https://spitalthun.ch/' ,'https://www.hopital-broye.ch/' ,'https://www.la-tour.ch/' ,'https://www.spitaluster.ch' ,'http://www.triemli.ch/' ,'https://www.klinikbethanien.ch/' ,'http://www.waidspital.ch/' ,'https://www.spital-schwyz.ch/startseite.html' ,'https://www.spital-einsiedeln.ch' ,'https://www.usz.ch/' ,'https://spitalstsag.ch/' ,'https://www.ksgr.ch/frauenklinik' ,'https://www.ksgr.ch/' ,'https://www.lups.ch/standorte/' ,'https://www.kispi.uzh.ch/rza/de/Seiten/default.aspx' ,'https://www.ksgr.ch/' ,'https://www.spital-limmattal.ch/' ,'https://www.stgag.ch/' ,'https://www.kantonsspitalbaden.ch/' 
,'https://adressverzeichnis.sozialearbeit.zhaw.ch/Detail/Index/Clienia_Schloessli_AG_-_Psychiatriezentrum_Uster-Alterstagesklinik-Uster-2ba9eb04-bc44-e311-8b4f-005056a606f6' ,'https://www.gzdielsdorf.ch/' ,'https://www.spital-linth.ch/' ,'https://www.stgag.ch/patienten-besucher/kantonsspital-muensterlingen/besuchszeiten/' ,'https://www.ghol.ch/jcms/ghol_5306/fr/votre-admission?portal=ghol_5842' ,'https://www.vidymed.ch/centre-medical-de-vidy.html' ,'https://www.hirslanden.ch/fr/clinique-bois-cerf/home.html' ,'https://www.hirslanden.ch/fr/clinique-cecil/home.html' ,'https://www.ehnv.ch/etablissements/hopital-de-la-vallee-de-joux' ,'https://gmo.ch/' ,'https://www.spitalzollikerberg.ch/' ,'https://www.hopitalduvalais.ch/fr/lhopital-du-valais/sites/martigny.html' ,'https://www.hopitalduvalais.ch/fr/lhopital-du-valais/sites/clinique-saint-ame.html?ct=0&cHash=58be177c5705953f44610229d3e65550' ,'https://www.hopitalrivierachablais.ch/jcms/c_5020/fr/monthey' ,'https://www.hopitalduvalais.ch/de/spital-wallis/standorte/visp.html' ,'https://www.hopitalduvalais.ch/fr/lhopital-du-valais/sites/sierre.html' ,'https://www.hopital-pae.ch/jcms/pae_10300/en/bienvenue' ,'https://www.hirslanden.ch/de/klinik-permanence/home.html' ,'https://www.unispital-basel.ch/' ,'https://www.rehab.ch/home.html' ,'https://www.kispisg.ch/de' ,'https://geriatrie-sg.ch/' ,'https://www.spitalbelp.ch/de/' ,'https://bethesda-spital.ch/' ,'https://www.hug-ge.ch/joli-mont' ,'https://www.h-fr.ch/' ,'https://www.daler.ch/' ,'https://www.ksbl.ch/das-ksbl/standorte/der-standort-bruderholz' ,'http://www.ksnw.ch' ,'https://www.srft.ch/' ,'https://www.reha-rheinfelden.ch/' ,'https://vertpre.com/fr/' ,'https://www.hug-ge.ch/psychiatrie-enfant-adolescent/unite-hospitalisation-jour' ,'https://www.claraspital.ch/' ,'https://merianiselin.ch/klinik/' ,'https://www.adullam.ch/' ,'https://srrws.ch/' ,'https://www.hirslanden.ch/global/de/startseite/kliniken_zentren/klinik_am_rosenberg.html' ,'https://www.luks.ch/' 
,'https://www.luks.ch/standorte/standort-sursee' ,'https://www.klinik-adelheid.ch/' ,'https://www.klinik-arlesheim.ch/' ,'https://www.gzf.ch/startseite.html' ,'https://www.h-fr.ch/hfr/de/pub/patienten/standorte/meyriez.htm' ,'https://www.cliniquegenerale.ch/de/' ,'https://www.eoc.ch/Ospedali-e-Istituti/Ospedale-Regionale-di-Bellinzona-e-Valli/Bellinzona/Presentazione.html' ,'https://www.pdag.ch/' ,'https://www.ksgl.ch' ,'https://www.ksuri.ch/' ,'https://www.santacroce.ch/index.php?view=1000' ,'https://moncucco.ch/' ,'https://www.clinicasantanna.ch/it/' ,'https://www.csbregaglia.ch/' ,'https://www.spitalthusis.ch/' ,'https://www.csvm.ch/' ,'https://www.spitaeler-sh.ch/Patienten-Besucher/Psychiatriezentrum/index.php' ,'https://www.rehaclinic.ch' ,'http://www.spitalleuggern.ch/' ,'https://www.h-ju.ch/fr/L-Hopital-du-jura/Les-4-sites-de-l-hopital/Saignelegier/H-JU-site-de-Saignelegier.html' ,'https://www.h-ne.ch/contact/le-locle' ,'https://www.h-ne.ch/contact/val-de-ruz' ,'https://www.hopital-providence.ch/' ,'https://www.h-fr.ch/nos-sites-hospitaliers/hfr-tafers' ,'https://www.spitalriggisberg.ch/de/' ,'https://www.hirslanden.ch/de/klinik-linde/home.html' ,'https://www.klinikhohmad.ch' ,'https://www.rehaseewis.ch/home.html' ,'https://www.srrws.ch/ueber-uns/organisation/spitaeler/spital-altstaetten.html' ,'https://www.spitalverbund.ch/angebote-heiden-herisau/home-spital-heiden/' ,'https://www.ehnv.ch/jcms/obr_5017/orbe' ,'https://www.ehnv.ch/etablissements/hopital-de-chamblon' ,'https://www.hopitalduvalais.ch/de/spital-wallis/standorte/malevoz.html' ,'https://www.h-fr.ch/hfr/de/pub/dashfr/standorte/billens.htm' ,'http://www.clinique-le-noirmont.ch/' ,'https://www.h-ju.ch/' ,'https://www.h-ju.ch/' ,'https://www.rehabern.ch/' ,'https://www.eoc.ch/Ospedali-e-Istituti/Ospedale-Regionale-di-Locarno/Presentazione.html' ,'https://www.clinicasantachiara.ch/' ,'https://www.arsmedica.ch/it/' ,'https://www.spital-savognin.ch/' ,'https://cseb.ch/' 
,'https://www.flurystiftung.ch/Startseite.20.0.html' ,'https://www.stephanshorn.ch/' ,'https://www.clienia.ch/de/standorte/standorte-stationaer/thurgau/littenheid' ,'https://seeklinik-brunnen.ch/' ,'https://www.paraplegie.ch/spz/de' ,'https://www.spitalmenziken.ch/' ,'https://www.gzf.ch/' ,'https://www.hirslanden.ch/de/klinik-belair/home.html' ,'https://klinik-schloss-mammern.ch/de/' ,'https://www.upd.ch/' ,'https://www.solothurnerspitaeler.ch/unsere-spitaeler/buergerspital-solothurn/' ,'https://www.klinik-gais.ch/home/' ,'https://www.usz.ch/standorte/usz-flughafen/' ,'https://www.herz-zentrum.com/muensterlingen/klinik/standorte' ,'https://www.ukbb.ch/' ,'https://www.hopital-broye.ch' ,'https://www.luks.ch/standorte/standort-luzern/kinderspital' ,'https://www.solothurnerspitaeler.ch/unsere-spitaeler/spital-dornach/' ,'https://www.zgks.ch/' ,'https://www.lups.ch/erwachsenen-psychiatrie/allgemeinpsychiatrie/kliniken/' ,'https://www.lups.ch' ,'https://www.spitalaffoltern.ch' ,'https://www.chuv.ch/' ,'https://www.spitalbuelach.ch/' ,'https://www.spitalfmi.ch/index.php?id=2217' ,'https://www.spitaeler-sh.ch/Patienten-Besucher/Kantonsspital-SH/index.php' ,'https://www.andreasklinik.ch/' ,'https://www.siloah.ch' ,'http://www.st-anna.ch' ,'https://www.spitalzentrum-biel.ch' ,'https://www.clienia.ch/' ,'https://www.kispi.uzh.ch/' ,'https://www.solothurnerspitaeler.ch/unsere-spitaeler/kantonsspital-olten/' ,'https://www.ksow.ch/']
|
| 67 |
+
if __name__ == "__main__":
    # The URL list contains repeated entries. Two workers given the same start
    # URL would both pass the "already crawled" check and write the same file
    # at once, so each URL is dispatched only once (first-seen order is kept).
    unique_urls = list(dict.fromkeys(urls))

    with Pool(processes=20) as pool:
        pool.map(crawl_bfs_single, unique_urls)
    print("✅ Crawled all the pages")
|
| 72 |
+
|
| 73 |
+
|
summarize.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from urllib import response
|
| 3 |
+
import requests
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
from openai import OpenAI
|
| 6 |
+
|
| 7 |
+
# Directory holding the cleaned per-hospital JSON files to summarize.
parent = '/capstor/scratch/cscs/dshah/AI/CleanedHospitalData'
# SECURITY: the API key is read from the environment rather than stored in the
# source. Any key that was previously committed here should be treated as
# leaked and revoked. An empty string is used when the variable is unset.
openrouterapi = os.environ.get('OPENROUTER_API_KEY', '')
|
| 10 |
+
|
| 11 |
+
def get_from_or(file_path):
    """Summarize one cleaned hospital file through the OpenRouter chat API.

    Args:
        file_path: File name relative to ``parent``. Names that do not end in
            '.json' are rejected.

    Returns:
        The summary text on success. The string 'error' is returned when the
        file is not a '.json' file, when a summary file already exists, when
        the API returns no text, or when the request or the write fails.
    """
    if not file_path.endswith('.json'):
        return 'error'
    file_path = os.path.join(parent, file_path)

    output_file = file_path.replace('CleanedHospitalData', 'SummarizedCleanedHospitalData')
    if os.path.exists(output_file):
        # Already summarized by an earlier run.
        return 'error'

    with open(file_path, 'r') as file:
        content = file.read()

    instruction = 'Summarize the above text with Hospital name as the heading and in key word format. Make it as precise as possible and do not assume anything apart from the text. Give me the keywords related to the technologies that are offered by the hospital, The specialties they have and the services they provide. Ignore writing everything else. Do not write any warnings or disclaimers, only useful text.'

    # The request is inside the try block so that one failed call (network,
    # quota, rate limit) returns 'error' instead of aborting the whole pool.map.
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=openrouterapi,
        )
        completion = client.chat.completions.create(
            model="x-ai/grok-4-fast:free",
            messages=[
                {
                    'role': 'user',
                    'content': [{'type': 'text', 'text': content}],
                },
                {'role': 'user', 'content': [{'type': 'text', 'text': instruction}]},
            ],
        )
        output = completion.choices[0].message.content
        if not output:
            # Do not create an empty output file: it would make later runs
            # skip this hospital as already summarized.
            print("Error occurred: empty completion")
            return 'error'
        # The output directory is not created anywhere else.
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        with open(output_file, 'w') as out_file:
            out_file.write(output)
        return output
    except Exception as e:
        print(f"Error occurred: {e}")
        return 'error'
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# Request headers for the Swisscom-hosted Apertus endpoint.
# SECURITY: the bearer token is read from the environment rather than stored in
# the source. A token that was previously committed here should be revoked.
headers = {
    'Authorization': 'Bearer ' + os.environ.get('SWISSCOM_API_TOKEN', ''),
    'Content-Type': 'application/json',
}
|
| 62 |
+
def get_json_for_file(file_path):
    """Build the chat-completion request payload for one file.

    Args:
        file_path: Path of the text file whose content is to be summarized.

    Returns:
        A dict with the model name and two user messages: the file content,
        followed by the summarization instruction.
    """
    with open(file_path, 'r') as handle:
        document = handle.read()

    instruction = 'Summarize the above text in key word format. Make it as precise as possible and do not assume anything apart from the text. Give me the keywords related to the technologies that are offered by the hospital, The specialties they have and the services they provide. Ignore writing everything else. Do not write any warnings or disclaimers, only useful text.'

    conversation = [
        {'role': 'user', 'content': document},
        {'role': 'user', 'content': instruction},
    ]
    return {'model': 'swiss-ai/Apertus-70B', 'messages': conversation}
|
| 75 |
+
# json_data = {
|
| 76 |
+
# 'model': 'swiss-ai/Apertus-70B',
|
| 77 |
+
# 'messages': [
|
| 78 |
+
# {
|
| 79 |
+
# 'role': 'user',
|
| 80 |
+
# 'content': 'Hello, how are you?',
|
| 81 |
+
# },
|
| 82 |
+
# ],
|
| 83 |
+
# }
|
| 84 |
+
# for file in tqdm(os.listdir(parent)):
|
| 85 |
+
# Every entry of the cleaned-data directory is submitted; get_from_or itself
# rejects names that do not end in '.json'.
files = os.listdir(parent)
import multiprocessing as mp

if __name__ == '__main__':
    # One task per directory entry, spread over 100 worker processes.
    worker_pool = mp.Pool(processes=100)
    with worker_pool:
        worker_pool.map(get_from_or, files)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# json_data = get_json_for_file(file_path)
|
| 94 |
+
# response = requests.post(
|
| 95 |
+
# 'https://api.swisscom.com/layer/swiss-ai-weeks/apertus-70b/v1/chat/completions',
|
| 96 |
+
# headers=headers,
|
| 97 |
+
# json=json_data,
|
| 98 |
+
# )
|
| 99 |
+
# with open(output_file, 'w') as out_file:
|
| 100 |
+
# out_file.write(response.text)
|
| 101 |
+
|
| 102 |
+
# response_text = get_from_or(file_path)
|
| 103 |
+
# print(response.json())
|
| 104 |
+
|
| 105 |
+
|