Eleonora Bernasconi committed on
Commit
6a03357
·
1 Parent(s): abef483

Add application file

Browse files
__pycache__/filterDataframe.cpython-37.pyc ADDED
Binary file (1.89 kB). View file
 
__pycache__/scholarly.cpython-37.pyc ADDED
Binary file (633 Bytes). View file
 
app.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import scholarly
4
+
5
st.title("CSV Data Viewer")


def load_data():
    """Read the first ten columns of the semicolon-separated ``data.csv``.

    Returns:
        pd.DataFrame: The parsed CSV contents.
    """
    return pd.read_csv("data.csv", sep=";", usecols=range(10))


def _has_text(value):
    """Return True when *value* is a usable cell value.

    pandas represents missing cells as NaN, which is truthy, so a plain
    ``if value:`` check is not enough to detect a missing DOI or title.
    """
    return pd.notna(value) and bool(str(value).strip())


def _lookup_citation_count(row):
    """Resolve the citation count for a single CSV row.

    The row's DOI is tried first; when it is missing or yields no count,
    the DOI is looked up from the row's title instead.

    Args:
        row (pd.Series): One row of the loaded CSV; must contain ``doi``.

    Returns:
        The citation count reported by ``scholarly.get_citation_count``,
        or None when no count could be determined. Exactly one value is
        returned per row so the results always align with the DataFrame.
    """
    doi = row['doi']
    if _has_text(doi):
        citation_count = scholarly.get_citation_count(doi)
        if citation_count is not None:
            st.text(f"DOI: {doi}, Citation Count: {citation_count}")
            return citation_count

    # Fall back to a title-based lookup. ``row.get`` tolerates a CSV whose
    # first ten columns do not include a 'title' column.
    title = row.get('title')
    if not _has_text(title):
        return None

    try:
        doi_bytitle = scholarly.get_doi_from_title(str(title))
    except Exception as exc:
        # UI boundary: one failed lookup (no hits, network error) must not
        # abort the whole run, so report it and record a missing count.
        st.warning(f"Title lookup failed for {title!r}: {exc}")
        return None

    citation_count_title = scholarly.get_citation_count(doi_bytitle)
    st.text(f"DOI from Title: {title}, Citation Count: {citation_count_title}")
    return citation_count_title


data = load_data()

# Display the data
st.write("Data from CSV:")
st.write(data)

cit_array = []

if 'doi' not in data.columns:
    st.write("The 'doi' column does not exist in the CSV.")
else:
    # Collect exactly one citation count per row; a missing entry would make
    # the column assignment below fail with a length mismatch.
    for _, row in data.iterrows():
        cit_array.append(_lookup_citation_count(row))

    # Add the citation count column to the DataFrame
    data['Citation Count'] = cit_array
    st.write(data)
    if not data.empty:
        st.download_button(
            label="Download Filtered Data as CSV",
            data=data.to_csv(index=False).encode(),
            file_name="filtered_data.csv",
            key="download_filtered_data",
        )
data.csv ADDED
The diff for this file is too large to render. See raw diff
 
data.xlsx ADDED
Binary file (62.8 kB). View file
 
filterDataframe.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import streamlit as st
3
+ from pandas.api.types import (
4
+ is_categorical_dtype,
5
+ is_datetime64_any_dtype,
6
+ is_numeric_dtype,
7
+ is_object_dtype,
8
+ )
9
+
10
def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a UI on top of a dataframe to let viewers filter columns

    A checkbox enables filtering. For every column the viewer selects, a
    widget matching the column's content is rendered: a multiselect for
    categorical or low-cardinality columns, a range slider for numeric
    columns, a date-range picker for datetime columns, and a
    substring/regex text box for everything else.

    Args:
        df (pd.DataFrame): Original dataframe

    Returns:
        pd.DataFrame: Filtered dataframe (the original object is returned
        unchanged when filtering is not enabled)
    """
    import re  # local import: only needed to report invalid user regexes

    modify = st.checkbox("Add filters")

    if not modify:
        return df

    df = df.copy()

    # Try to convert datetimes into a standard format (datetime, no timezone)
    for col in df.columns:
        if is_object_dtype(df[col]):
            try:
                df[col] = pd.to_datetime(df[col])
            except Exception:
                # Best effort: a column that does not parse as datetimes
                # simply keeps its original dtype.
                pass

        if is_datetime64_any_dtype(df[col]):
            df[col] = df[col].dt.tz_localize(None)

    modification_container = st.container()

    with modification_container:
        to_filter_columns = st.multiselect("Filter dataframe on", df.columns)
        for column in to_filter_columns:
            left, right = st.columns((1, 20))
            left.write("↳")
            # Treat columns with < 10 unique values as categorical.
            # isinstance() replaces the deprecated is_categorical_dtype().
            is_categorical = isinstance(df[column].dtype, pd.CategoricalDtype)
            if is_categorical or df[column].nunique() < 10:
                user_cat_input = right.multiselect(
                    f"Values for {column}",
                    df[column].unique(),
                    default=list(df[column].unique()),
                )
                df = df[df[column].isin(user_cat_input)]
            elif is_numeric_dtype(df[column]):
                _min = float(df[column].min())
                _max = float(df[column].max())
                step = (_max - _min) / 100
                user_num_input = right.slider(
                    f"Values for {column}",
                    _min,
                    _max,
                    (_min, _max),
                    step=step,
                )
                df = df[df[column].between(*user_num_input)]
            elif is_datetime64_any_dtype(df[column]):
                user_date_input = right.date_input(
                    f"Values for {column}",
                    value=(
                        df[column].min(),
                        df[column].max(),
                    ),
                )
                if len(user_date_input) == 2:
                    start_date, end_date = (
                        pd.to_datetime(picked) for picked in user_date_input
                    )
                    # The picker yields calendar dates (midnight). Compare on
                    # the date part so rows later on the end date are kept.
                    day_values = df[column].dt.normalize()
                    df = df.loc[day_values.between(start_date, end_date)]
            else:
                user_text_input = right.text_input(
                    f"Substring or regex in {column}",
                )
                if user_text_input:
                    # Cast to str so non-string cells can be searched, and
                    # exclude missing cells explicitly; a NaN in the mask
                    # would otherwise make the boolean indexing fail.
                    try:
                        matches = df[column].astype(str).str.contains(
                            user_text_input
                        )
                    except re.error as exc:
                        right.error(f"Invalid regular expression: {exc}")
                    else:
                        df = df[matches & df[column].notna()]

    return df
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ scholarly
2
+ habanero
scholarly.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from habanero import counts
2
+ from habanero import Crossref
3
+
4
+
5
+
6
def get_citation_count(doi):
    """Return the citation count for a DOI as a string.

    The count is fetched with ``habanero.counts.citation_count``.

    Args:
        doi: DOI to look up. Anything that is not a non-blank string
            (None, NaN, "") is treated as "no DOI" and never triggers a
            lookup.

    Returns:
        str | None: The citation count converted to ``str``, or None when
        the DOI is unusable or the lookup fails.
    """
    if not isinstance(doi, str) or not doi.strip():
        return None

    try:
        return str(counts.citation_count(doi=doi.strip()))
    except Exception:
        # Deliberate best effort: callers use None to mean "count
        # unavailable" and then try other lookup strategies.
        return None
13
+
14
def get_doi_from_title(title):
    """Look up the DOI of the best Crossref match for a work title.

    Args:
        title: Title text used as the Crossref ``query``. Anything that is
            not a non-blank string is treated as "no title".

    Returns:
        str | None: The DOI of the first returned work, or None when the
        title is unusable or the query returns no items.
    """
    if not isinstance(title, str) or not title.strip():
        return None

    cr = Crossref()
    # Only the first hit is used, so ask for a single result.
    result = cr.works(query=title, limit=1)
    items = result.get('message', {}).get('items') or []
    if not items:
        return None
    return items[0].get('DOI')