# ircdl_data/merge.py — Eleonora Bernasconi — updates (commit ce81144)
import streamlit as st
import pandas as pd
import xml.etree.ElementTree as ET
import csv
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from tqdm import tqdm
# Compute the fuzzy-match score (0-100) between two strings, case-insensitively.
def similarity(string1, string2):
    left = string1.lower()
    right = string2.lower()
    return fuzz.ratio(left, right)
# Merge DBLP records with Google Scholar and Semantic Scholar records by fuzzy
# title match and enrich them with DOI / citation / abstract columns.
def merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df):
    """Join the three bibliographic DataFrames on fuzzy title similarity.

    For each DBLP row, rows in ``scholar_df`` and ``semantics_df`` whose
    ``Title`` is more than 80% similar (fuzzywuzzy ratio) are considered a
    match; the first match contributes DOI/citation/abstract columns to the
    merged record.

    Side effects: writes the merged result to
    'output/scholar_dblp_semantics.csv' (';'-separated, UTF-8).

    Returns:
        pandas.DataFrame with the merged/enriched records.
    """
    merged_records = []

    # Normalize titles to strings ONCE — the original re-cast both columns on
    # every DBLP iteration, which is loop-invariant work.
    scholar_df['Title'] = scholar_df['Title'].astype(str)
    semantics_df['Title'] = semantics_df['Title'].astype(str)

    # Progress bar over the DBLP rows being merged.
    total_records = len(dblp_df)
    with tqdm(total=total_records, desc="Unione dei record") as pbar:
        for _, dblp_row in dblp_df.iterrows():
            dblp_title = str(dblp_row['Title'])

            # Rows in the other datasets whose title is > 80% similar.
            scholar_matches = scholar_df[
                scholar_df['Title'].apply(lambda x: similarity(dblp_title, str(x))) > 80
            ]
            semantics_matches = semantics_df[
                semantics_df['Title'].apply(lambda x: similarity(dblp_title, str(x))) > 80
            ]

            try:
                merged_record = dblp_row.copy()
                if not scholar_matches.empty:
                    # Enrich with DOI, citation count and abstract from the
                    # first Google Scholar match.
                    merged_record['doi_scholar'] = scholar_matches.iloc[0]['DOI']
                    merged_record['cites_scholar'] = scholar_matches.iloc[0]['Cites']
                    merged_record['abstract_scholar'] = scholar_matches.iloc[0]['Abstract']
                if not semantics_matches.empty:
                    # Enrich with DOI and citation count from the first
                    # Semantic Scholar match.
                    merged_record['doi_semantic'] = semantics_matches.iloc[0]['DOI']
                    merged_record['cites_semantic'] = semantics_matches.iloc[0]['Cites']
                    # BUG FIX: the original assigned this placeholder to
                    # 'abstract_scholar', clobbering the real abstract set in
                    # the Scholar branch above. Use the semantic-specific key.
                    merged_record['abstract_semantic'] = 'None'
                merged_records.append(merged_record)
            except Exception as e:
                # Best-effort merge: log the failing record and move on.
                # NOTE(review): an error also skips pbar.update, so the bar
                # may finish short of total_records (preserved behavior).
                print(f"Errore durante l'unione dei record: {str(e)}")
                continue

            pbar.update(1)

    # Assemble the final DataFrame and persist it.
    final_df = pd.DataFrame(merged_records)
    final_df.to_csv('output/scholar_dblp_semantics.csv', index=False, sep=';', encoding='utf-8')
    return final_df
def main():
    """Streamlit UI: build and display the enriched dataset on demand."""
    st.title("Knowledge enrichment")
    if st.button("create enriched dataset"):
        # All three crawler exports share the same CSV dialect.
        def _load(path):
            return pd.read_csv(path, sep=";", encoding='utf-8')

        scholar_df = _load("output/googleScholarcsv.csv")
        semantics_df = _load("output/semanticscholarcsv.csv")
        dblp_df = _load("output/crawler_doi_citation.csv")

        # Merge the sources and render the result in the app.
        enriched_dataset = merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df)
        st.write("Enriched Dataset:")
        st.dataframe(enriched_dataset)