import streamlit as st
import pandas as pd
import xml.etree.ElementTree as ET
import csv
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from tqdm import tqdm
def similarity(string1, string2):
    """Return the fuzzy-match score (0-100) of two strings, ignoring case."""
    lhs = string1.lower()
    rhs = string2.lower()
    return fuzz.ratio(lhs, rhs)
# Function to merge and enrich datasets
def merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df):
    """Enrich DBLP records with matches from Google Scholar and Semantic Scholar.

    For each DBLP row, rows in the other two DataFrames whose 'Title' has a
    fuzzy similarity above 80% are considered matches; the first match
    contributes its DOI / citation / abstract columns to the merged record.

    Args:
        scholar_df: DataFrame with 'Title', 'DOI', 'Cites', 'Abstract' columns.
        semantics_df: DataFrame with 'Title', 'DOI', 'Cites' columns.
        dblp_df: DataFrame with a 'Title' column; drives the iteration.

    Returns:
        The merged DataFrame; also written to
        'output/scholar_dblp_semantics.csv' (';'-separated, UTF-8) as a side effect.
    """
    merged_records = []
    # Normalize titles to str ONCE, before the loop. The original re-cast both
    # whole columns on every DBLP row, which is quadratic wasted work.
    scholar_df['Title'] = scholar_df['Title'].astype(str)
    semantics_df['Title'] = semantics_df['Title'].astype(str)
    total_records = len(dblp_df)
    # Progress bar over the DBLP rows being merged.
    with tqdm(total=total_records, desc="Unione dei record") as pbar:
        for _, dblp_row in dblp_df.iterrows():
            dblp_title = str(dblp_row['Title'])
            # Boolean mask: rows whose title is >80% similar to the DBLP title.
            scholar_matches = scholar_df[scholar_df['Title'].apply(lambda x: similarity(dblp_title, str(x))) > 80]
            semantics_matches = semantics_df[semantics_df['Title'].apply(lambda x: similarity(dblp_title, str(x))) > 80]
            try:
                merged_record = dblp_row.copy()
                if not scholar_matches.empty:
                    # Enrich with DOI, citation count and abstract from the
                    # first Scholar match only.
                    merged_record['doi_scholar'] = scholar_matches.iloc[0]['DOI']
                    merged_record['cites_scholar'] = scholar_matches.iloc[0]['Cites']
                    merged_record['abstract_scholar'] = scholar_matches.iloc[0]['Abstract']
                if not semantics_matches.empty:
                    merged_record['doi_semantic'] = semantics_matches.iloc[0]['DOI']
                    merged_record['cites_semantic'] = semantics_matches.iloc[0]['Cites']
                    # BUG FIX: the original assigned this placeholder to
                    # 'abstract_scholar', clobbering the Scholar abstract set
                    # above; the key matching doi_/cites_semantic is used here.
                    merged_record['abstract_semantic'] = 'None'
                merged_records.append(merged_record)
            except Exception as e:
                print(f"Errore durante l'unione dei record: {str(e)}")
                continue
            # Advance the progress bar (skipped when a row errored, as before).
            pbar.update(1)
    # Persist and return the merged result.
    final_df = pd.DataFrame(merged_records)
    final_df.to_csv('output/scholar_dblp_semantics.csv', index=False, sep=';', encoding='utf-8')
    return final_df
def main():
    """Streamlit UI: on button click, load the three CSVs and show the merge."""
    st.title("Knowledge enrichment")
    if st.button("create enriched dataset"):
        # Load the three source datasets (';'-separated, UTF-8).
        scholar_df = pd.read_csv("output/googleScholarcsv.csv", sep=";", encoding='utf-8')
        semantics_df = pd.read_csv("output/semanticscholarcsv.csv", sep=";", encoding='utf-8')
        dblp_df = pd.read_csv("output/crawler_doi_citation.csv", sep=";", encoding='utf-8')
        # Merge and enrich, then display the result in the app.
        enriched_dataset = merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df)
        st.write("Enriched Dataset:")
        st.dataframe(enriched_dataset)


# FIX: the file defined main() but never invoked it; without this guard the
# Streamlit app renders nothing when the script is run.
if __name__ == "__main__":
    main()