import streamlit as st
import pandas as pd
import xml.etree.ElementTree as ET
import csv
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from tqdm import tqdm


def similarity(string1, string2):
    """Return the fuzzy similarity score (0-100) of two strings, case-insensitively."""
    return fuzz.ratio(string1.lower(), string2.lower())


def merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df):
    """Merge DBLP records with Google Scholar and Semantic Scholar records.

    For each DBLP row, rows of the other two DataFrames whose 'Title' has a
    fuzzy similarity above 80% are treated as matches, and the DOI / citation /
    abstract fields of the first match are copied onto the DBLP record.

    Side effects:
        Writes the merged result to 'output/scholar_dblp_semantics.csv'
        (';'-separated, UTF-8) and mutates the 'Title' column of the two
        input DataFrames to str dtype.

    Returns:
        pd.DataFrame: the enriched records.
    """
    merged_records = []
    total_records = len(dblp_df)

    # Hoisted out of the loop (was re-executed for every DBLP row):
    # normalise titles to str once so the fuzzy comparison never sees NaN/float.
    scholar_df['Title'] = scholar_df['Title'].astype(str)
    semantics_df['Title'] = semantics_df['Title'].astype(str)

    with tqdm(total=total_records, desc="Unione dei record") as pbar:
        for _, dblp_row in dblp_df.iterrows():
            dblp_title = str(dblp_row['Title'])

            # Boolean masks: rows whose title is >80% similar to the DBLP title.
            scholar_matches = scholar_df[
                scholar_df['Title'].apply(lambda x: similarity(dblp_title, str(x))) > 80
            ]
            semantics_matches = semantics_df[
                semantics_df['Title'].apply(lambda x: similarity(dblp_title, str(x))) > 80
            ]

            try:
                merged_record = dblp_row.copy()
                if not scholar_matches.empty:
                    # Copy DOI / citation count / abstract from the best Scholar match.
                    merged_record['doi_scholar'] = scholar_matches.iloc[0]['DOI']
                    merged_record['cites_scholar'] = scholar_matches.iloc[0]['Cites']
                    merged_record['abstract_scholar'] = scholar_matches.iloc[0]['Abstract']
                if not semantics_matches.empty:
                    merged_record['doi_semantic'] = semantics_matches.iloc[0]['DOI']
                    merged_record['cites_semantic'] = semantics_matches.iloc[0]['Cites']
                    # BUG FIX: the original assigned 'None' to 'abstract_scholar' here,
                    # clobbering the abstract copied from the Scholar match above.
                    # The key parallel to doi_semantic/cites_semantic was intended.
                    merged_record['abstract_semantic'] = 'None'
                merged_records.append(merged_record)
            except Exception as e:
                print(f"Errore durante l'unione dei record: {str(e)}")
            finally:
                # BUG FIX: the original `continue` in the except branch skipped
                # pbar.update(1), so the bar undercounted whenever a row errored.
                pbar.update(1)

    final_df = pd.DataFrame(merged_records)
    final_df.to_csv('output/scholar_dblp_semantics.csv', index=False, sep=';', encoding='utf-8')
    return final_df


def main():
    """Streamlit UI: load the three CSV exports, enrich them, and display the result."""
    st.title("Knowledge enrichment")
    if st.button("create enriched dataset"):
        # Load the three source datasets produced by the earlier pipeline stages.
        scholar_df = pd.read_csv("output/googleScholarcsv.csv", sep=";", encoding='utf-8')
        semantics_df = pd.read_csv("output/semanticscholarcsv.csv", sep=";", encoding='utf-8')
        dblp_df = pd.read_csv("output/crawler_doi_citation.csv", sep=";", encoding='utf-8')

        enriched_dataset = merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df)

        st.write("Enriched Dataset:")
        st.dataframe(enriched_dataset)