# Hugging Face Spaces app (the scraped page header reported "Runtime error").
| import streamlit as st | |
| import pandas as pd | |
| import xml.etree.ElementTree as ET | |
| import csv | |
| from fuzzywuzzy import fuzz | |
| from fuzzywuzzy import process | |
| from tqdm import tqdm | |
def similarity(string1, string2):
    """Return the fuzzy similarity score (0-100) between two strings.

    Comparison is case-insensitive: both inputs are lowercased before
    being scored with fuzzywuzzy's Levenshtein-based ratio.
    """
    lhs = string1.lower()
    rhs = string2.lower()
    return fuzz.ratio(lhs, rhs)
def merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df):
    """Merge DBLP records with fuzzy-title-matched Scholar/Semantic Scholar rows.

    For each DBLP row, rows in the other two DataFrames whose 'Title' is more
    than 80% similar (fuzzywuzzy ratio) are located; the first match's DOI,
    citation count and (for Scholar) abstract are copied onto a copy of the
    DBLP row. The merged records are written to
    'output/scholar_dblp_semantics.csv' (semicolon-separated, UTF-8).

    Args:
        scholar_df: Google Scholar data with 'Title', 'DOI', 'Cites', 'Abstract'.
        semantics_df: Semantic Scholar data with 'Title', 'DOI', 'Cites'.
        dblp_df: DBLP data with a 'Title' column; one output row per DBLP row.

    Returns:
        pd.DataFrame of the merged records (rows that raised during the merge
        are logged and skipped).
    """
    merged_records = []
    total_records = len(dblp_df)

    # Normalize the title columns to strings ONCE. The original re-ran
    # astype(str) on both full columns inside the loop — pure loop-invariant
    # work repeated for every DBLP row.
    scholar_titles = scholar_df['Title'].astype(str)
    semantics_titles = semantics_df['Title'].astype(str)

    # tqdm progress bar over the DBLP rows being merged.
    with tqdm(total=total_records, desc="Unione dei record") as pbar:
        for _, dblp_row in dblp_df.iterrows():
            dblp_title = str(dblp_row['Title'])
            # Boolean masks: rows whose title is >80% similar to the DBLP title.
            scholar_matches = scholar_df[
                scholar_titles.apply(lambda t: similarity(dblp_title, t)) > 80
            ]
            semantics_matches = semantics_df[
                semantics_titles.apply(lambda t: similarity(dblp_title, t)) > 80
            ]
            try:
                merged_record = dblp_row.copy()
                if not scholar_matches.empty:
                    # Enrich with DOI/citations/abstract from the first Scholar match.
                    merged_record['doi_scholar'] = scholar_matches.iloc[0]['DOI']
                    merged_record['cites_scholar'] = scholar_matches.iloc[0]['Cites']
                    merged_record['abstract_scholar'] = scholar_matches.iloc[0]['Abstract']
                if not semantics_matches.empty:
                    merged_record['doi_semantic'] = semantics_matches.iloc[0]['DOI']
                    merged_record['cites_semantic'] = semantics_matches.iloc[0]['Cites']
                    # BUG FIX: the original assigned 'None' to 'abstract_scholar'
                    # here, clobbering the Scholar abstract set above whenever a
                    # Semantic Scholar match existed. The placeholder belongs
                    # under its own column.
                    merged_record['abstract_semantic'] = 'None'
                merged_records.append(merged_record)
            except Exception as e:
                # Best-effort merge: log the failure and move on to the next row.
                print(f"Errore durante l'unione dei record: {str(e)}")
                continue
            finally:
                # In 'finally' so the bar also advances for rows that errored
                # (the original 'continue' skipped the update on errors,
                # leaving the bar stuck short of the total).
                pbar.update(1)

    # Persist and return the enriched dataset.
    final_df = pd.DataFrame(merged_records)
    final_df.to_csv('output/scholar_dblp_semantics.csv', index=False, sep=';', encoding='utf-8')
    return final_df
def main():
    """Streamlit entry point.

    Renders the page title and, when the button is pressed, loads the three
    source CSVs, merges/enriches them, and displays the resulting DataFrame.
    """
    st.title("Knowledge enrichment")

    # Guard clause: nothing to do until the user clicks the button.
    if not st.button("create enriched dataset"):
        return

    # All three sources are semicolon-separated UTF-8 CSVs.
    paths = (
        "output/googleScholarcsv.csv",
        "output/semanticscholarcsv.csv",
        "output/crawler_doi_citation.csv",
    )
    scholar_df, semantics_df, dblp_df = (
        pd.read_csv(p, sep=";", encoding='utf-8') for p in paths
    )

    # Build and render the enriched dataset.
    enriched_dataset = merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df)
    st.write("Enriched Dataset:")
    st.dataframe(enriched_dataset)