File size: 4,719 Bytes
ea361ad
438c3c8
 
ea361ad
 
 
 
 
438c3c8
ea361ad
438c3c8
ea361ad
438c3c8
ea361ad
 
438c3c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea361ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
716b2bd
 
ea361ad
 
716b2bd
ea361ad
 
 
716b2bd
ea361ad
 
 
716b2bd
 
 
ea361ad
 
716b2bd
ea361ad
 
 
 
 
 
 
 
716b2bd
ea361ad
 
 
 
 
 
 
 
716b2bd
ea361ad
 
 
 
 
 
 
 
 
 
716b2bd
 
ea361ad
 
 
 
 
 
716b2bd
 
ea361ad
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import streamlit as st
import xml.etree.ElementTree as ET
import csv

def _generate_dblp_csv(xml_path='output/dblp.xml', csv_path='output/dblp.csv'):
    """Convert a DBLP XML search dump into a ';'-separated CSV file.

    Columns written: DBLP (hit id), Authors, Title, Year, DOI, URL.
    Absent optional elements are written as the sentinel string 'None'
    (the rest of the app tests for that sentinel) instead of raising
    AttributeError on `.text` of a missing element, which the original
    code did for title/year/url.

    Paths default to the hard-coded locations the app has always used,
    but are parameterized for reuse and testing.
    """
    def _text(node, path):
        # Element text, or the 'None' sentinel when the element is absent.
        element = node.find(path)
        return element.text if element is not None else 'None'

    tree = ET.parse(xml_path)
    root = tree.getroot()

    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=';')
        csvwriter.writerow(['DBLP', 'Authors', 'Title', 'Year', 'DOI', 'URL'])

        for hit in root.findall('.//hit'):
            authors = ', '.join(
                author.text or '' for author in hit.findall('.//authors/author')
            )
            # `hit_id`, not `id`: avoid shadowing the builtin.
            hit_id = hit.attrib.get('id', 'None')
            csvwriter.writerow([
                hit_id,
                authors,
                _text(hit, './/title'),
                _text(hit, './/year'),
                _text(hit, './/doi'),
                _text(hit, './/url'),
            ])


def knowledge_extraction():
    """Streamlit page: view the DBLP CSV, fill in missing DOIs via
    `scholarly`, retrieve citation counts, and offer a CSV download.

    Side effects: reads/writes files under output/ and renders widgets.
    Assumes `scholarly` exposes `get_doi_from_title` / `get_citation_count`
    — TODO confirm, the upstream `scholarly` package has no such API.
    """
    import streamlit as st
    import pandas as pd
    import scholarly

    st.title("CSV Data Viewer")

    def load_data():
        # NOTE(review): assumes output/dblp.csv already exists; the
        # "generate dblp csv" button below (re)creates it from the XML.
        return pd.read_csv("output/dblp.csv", sep=";", encoding="latin1")

    data = load_data()

    if st.button("generate dblp csv"):
        _generate_dblp_csv()
        # Show completion in the UI instead of printing to the server
        # console, and reload so this run reflects the regenerated file.
        st.write("Conversione XML in CSV completata.")
        data = load_data()

    count_none = (data['DOI'] == 'None').sum()
    total_values = len(data['DOI'])
    st.write("Number of values in DOI column equal to None / Total values: " + str(count_none) + "/" + str(total_values))

    # Step 1: display the raw data retrieved from DBLP.
    step_1 = st.sidebar.checkbox("1 - Display the data retrieved from DBLP (Digital Bibliography & Library Project)")
    if step_1:
        st.write("Data from DBLP:")
        st.write(data)

    # Step 2: resolve missing DOIs from the paper title.
    step_2 = st.sidebar.checkbox("2 - Iterate over rows and update 'doi' column if necessary")
    if step_2:
        for index, row in data.iterrows():
            # Only rows carrying the 'None' sentinel need a lookup.
            if row['DOI'] == 'None':
                doi = scholarly.get_doi_from_title(row['Title'])
                if doi:
                    data.at[index, 'DOI'] = doi

        # Re-display the None/total ratio after the fill-in pass.
        st.write("Data with DOI")
        count_none = (data['DOI'] == 'None').sum()
        total_values = len(data['DOI'])
        st.write("Number of values in DOI column equal to None / Total values: " + str(count_none) + "/" + str(total_values))
        st.write(data)

    # Step 3: retrieve a citation count for every row.
    step_3 = st.sidebar.checkbox("3 - Loop over DOIs and retrieve citation counts")
    if step_3:
        cit_array = []

        for index, row in data.iterrows():
            doi = row['DOI']
            citation_count = scholarly.get_citation_count(doi) if doi else None
            if citation_count is not None:
                cit_array.append(citation_count)
                st.text(f"DOI: {doi}, Citation Count: {citation_count}")
            else:
                # DOI lookup failed (or DOI missing): fall back to a
                # title-based lookup.
                title = row['Title']
                doi_bytitle = scholarly.get_doi_from_title(str(title))
                citation_count_title = scholarly.get_citation_count(doi_bytitle)
                if citation_count_title is not None:
                    st.text(f"DOI from Title: {title}, Citation Count: {citation_count_title}")
                # Append even when None so the column always matches the
                # row count (bug fix: the original skipped rows with a
                # falsy DOI, making the assignment below raise).
                cit_array.append(citation_count_title)

        # Attach the citation counts and persist the enriched table.
        data['Citation Count'] = cit_array
        st.write(data)
        output_path = "output/crawler_doi_citation.csv"
        data.to_csv(output_path, sep=';', index=False)

    # Step 4: offer the enriched table for download. NOTE(review): the
    # download is comma-separated while the file saved above uses ';' —
    # kept as-is for backward compatibility.
    if not data.empty and step_3:
        st.download_button(
            label="Download Filtered Data as CSV",
            data=data.to_csv(index=False).encode(),
            file_name="crawler_doi_citation.csv",
            key="crawler_doi_citation",
        )