Eleonora Bernasconi committed on
Commit
ba3cdcb
·
1 Parent(s): a1a8bdc
Files changed (1) hide show
  1. app.py +138 -58
app.py CHANGED
@@ -1,66 +1,146 @@
1
  import streamlit as st
2
- import pandas as pd
3
- import scholarly
4
 
5
- st.title("CSV Data Viewer")
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- def load_data():
8
- data = pd.read_csv("data.csv", sep=";", usecols=range(10), encoding="latin1")
9
- return data
 
 
 
10
 
11
- data = load_data()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # Display the data
14
- st.write("Data from CSV:")
15
- st.write(data)
16
 
17
- cit_array = []
18
- count = 0
19
- st.write(count)
20
 
21
- # Iterate over rows and update 'doi' column if necessary
22
- for index, row in data.iterrows():
23
- doi = row['doi']
24
- title = row['title']
 
 
 
 
 
 
 
 
 
25
 
26
- # If 'doi' is None, attempt to get DOI from title
27
- if pd.isnull(doi):
28
- doi = scholarly.get_doi_from_title(title)
29
- # Update the DataFrame with the retrieved DOI
30
- if doi:
31
- data.at[index, 'doi'] = doi
32
-
33
- # Display the updated data table
34
- st.write("Data with DOI")
35
- st.write(data)
36
-
37
- # Loop over DOIs and retrieve citation counts
38
- for index, row in data.iterrows():
39
- doi = row['doi']
40
- if doi:
41
- citation_count = scholarly.get_citation_count(doi)
42
- if citation_count != None:
43
- cit_array.append(citation_count)
44
- st.text(f"DOI: {doi}, Citation Count: {citation_count}")
45
- count += 1
46
- else:
47
- # Handle cases where DOI is None (e.g., bytitle lookup)
48
- title = row['title']
49
- doi_bytitle = scholarly.get_doi_from_title(str(title))
50
- citation_count_title = scholarly.get_citation_count(doi_bytitle)
51
- if citation_count_title != None:
52
- count += 1
53
- st.text(f"DOI from Title: {title}, Citation Count: {citation_count_title}")
54
- cit_array.append(citation_count_title)
55
-
56
-
57
- # Add the citation count column to the DataFrame
58
- data['Citation Count'] = cit_array
59
- st.write(data)
60
- if not data.empty:
61
- st.download_button(
62
- label="Download Filtered Data as CSV",
63
- data=data.to_csv(index=False).encode(),
64
- file_name="filtered_data.csv",
65
- key="download_filtered_data",
66
- )
 
1
  import streamlit as st
 
 
2
 
3
def knowledge_extraction():
    """Render the "Knowledge Extraction" page.

    Reads ``data.csv`` and exposes the processing pipeline as three sidebar
    checkboxes:

    1. display the table as loaded;
    2. for rows whose ``doi`` is missing, look the DOI up from the ``title``
       through the ``scholarly`` helper module and store it in the table;
    3. fetch a citation count for every row, attach the results as the
       ``Citation Count`` column and offer the table as a CSV download.

    Rows for which no citation count can be obtained keep a missing value in
    ``Citation Count`` so that the column always has one entry per row.
    """
    import streamlit as st
    import pandas as pd
    import scholarly

    st.title("CSV Data Viewer")

    # Semicolon-separated file; only the first ten columns are read.
    data = pd.read_csv("data.csv", sep=";", usecols=range(10), encoding="latin1")

    # Step 1: display the data retrieved from DBLP.
    step_1 = st.sidebar.checkbox("1 - Display the data retrieved from DBLP (Digital Bibliography & Library Project)")
    if step_1:
        st.write("Data from DBLP:")
        st.write(data)

    # Step 2: fill in missing DOIs from the paper title.
    step_2 = st.sidebar.checkbox("2 - Iterate over rows and update 'doi' column if necessary")
    if step_2:
        # NOTE(review): this always renders 0 and looks like a debugging
        # leftover; kept so the page output stays the same.
        count = 0
        st.write(count)

        for index, row in data.iterrows():
            if not pd.isnull(row['doi']):
                continue
            doi = scholarly.get_doi_from_title(row['title'])
            # The lookup may come back empty; only overwrite on success.
            if doi:
                data.at[index, 'doi'] = doi

        # Display the updated data table.
        st.write("Data with DOI")
        st.write(data)

    # Step 3: retrieve one citation count per row.
    step_3 = st.sidebar.checkbox("3 - Loop over DOIs and retrieve citation counts")
    if step_3:
        cit_array = []

        for _, row in data.iterrows():
            doi = row['doi']
            title = row['title']

            # A missing DOI is NaN, which is truthy, so test it explicitly
            # instead of relying on ``if doi``.
            if not pd.isnull(doi) and doi:
                citation_count = scholarly.get_citation_count(doi)
                if citation_count is not None:
                    st.text(f"DOI: {doi}, Citation Count: {citation_count}")
            else:
                # No DOI on the row: try to resolve one from the title first.
                doi_bytitle = scholarly.get_doi_from_title(str(title))
                citation_count = (
                    scholarly.get_citation_count(doi_bytitle) if doi_bytitle else None
                )
                if citation_count is not None:
                    st.text(f"DOI from Title: {title}, Citation Count: {citation_count}")

            # Always append (None when the lookup failed) so the list stays
            # aligned with the rows; a shorter list cannot be assigned as a
            # column.
            cit_array.append(citation_count)

        # Add the citation count column to the DataFrame.
        data['Citation Count'] = cit_array
        st.write(data)

    # Step 4: download the enriched table as CSV (only once step 3 has run).
    if not data.empty and step_3:
        st.download_button(
            label="Download Filtered Data as CSV",
            data=data.to_csv(index=False).encode(),
            file_name="filtered_data.csv",
            key="download_filtered_data",
        )
78
+
79
def analysis():
    """Render the "Analysis" page.

    Reads ``output/output_crawled_data.csv`` and reports three summaries
    based on its ``Citation Count`` column:

    * the ``authors`` value(s) with the highest summed citation count
      (rows are grouped by the full content of the ``authors`` column);
    * the ten rows with the highest citation count, shown by ``title``;
    * the five ``year`` values with the highest summed citation count.

    If the file contains no rows, a warning is shown instead of the
    summaries.
    """
    import streamlit as st
    import pandas as pd

    st.write("# Analysis of knowledge")

    data = pd.read_csv("output/output_crawled_data.csv", sep=",", encoding="latin1")

    # Step 1: optionally display the crawled table.
    step_1 = st.sidebar.checkbox("1 - Display the crawled data")
    if step_1:
        st.write("Crawled data:")
        st.write(data)

    # Nothing to rank without rows; stop before the aggregations.
    if data.empty:
        st.warning("No crawled data available for analysis.")
        return

    # Group the data by author and sum the citations for each author.
    authors_citation_counts = data.groupby('authors')['Citation Count'].sum().reset_index()

    # Sort the authors by citations, highest first.
    top_authors = authors_citation_counts.sort_values(by='Citation Count', ascending=False)

    # Find the authors that share the maximum citation total. ``max()`` is
    # used rather than ``iloc[0]`` so an empty grouping (e.g. every
    # ``authors`` value missing) yields an empty result instead of raising.
    max_citations = top_authors['Citation Count'].max()
    top_citation_authors = top_authors[top_authors['Citation Count'] == max_citations]['authors']

    # Find the most cited papers.
    top_papers = data.sort_values(by='Citation Count', ascending=False).head(10)

    # Find the years in which papers collected the most citations.
    years_with_most_citations = data.groupby('year')['Citation Count'].sum().reset_index()
    years_with_most_citations = years_with_most_citations.sort_values(by='Citation Count', ascending=False).head(5)

    # Display the results.
    st.write("Lista di autori con il massimo numero di citazioni:")
    st.write(top_citation_authors.to_string(index=False))

    st.write("\nLista dei paper più citati:")
    st.write(top_papers[['title', 'Citation Count']].to_string(index=False))

    st.write("\nAnni in cui i paper hanno ottenuto più citazioni:")
    st.write(years_with_most_citations.to_string(index=False))
124
 
125
+
126
def intro():
    """Render the welcome page: heading, sidebar hint and conference link."""
    import streamlit as st

    heading = "# IRCDL Conference: A Two-Decade Bibliometric Journey Through Digital Libraries Research"
    sidebar_hint = "Select a phase"
    site_link = """
    IRCDL site: https://ircdl2024.dei.unipd.it/
    """

    # Main-area heading first, then the sidebar prompt, then the link text.
    st.write(heading)
    st.sidebar.success(sidebar_hint)
    st.markdown(site_link)
137
+
138
+
139
# Sidebar navigation: each phase label maps to the function that draws it.
page_names_to_funcs = {
    "Welcome": intro,
    "Knowledge Extraction": knowledge_extraction,
    "Analysis": analysis,
}

# Let the user pick a phase in the sidebar, then render the matching page.
demo_name = st.sidebar.selectbox("Choose a phase", page_names_to_funcs.keys())
_render_page = page_names_to_funcs[demo_name]
_render_page()