import streamlit as st
import xml.etree.ElementTree as ET
import csv


def knowledge_extraction():
    """Streamlit page that builds and enriches a DBLP bibliography CSV.

    Workflow (driven by sidebar checkboxes / a button):
      1. Optionally regenerate ``output/dblp.csv`` from ``output/dblp.xml``.
      2. Display the loaded CSV.
      3. Fill in missing DOIs by title lookup (local ``scholarly`` helper).
      4. Retrieve a citation count per row, save and offer the result
         as ``output/crawler_doi_citation.csv``.

    Side effects: reads/writes files under ``output/`` and renders UI.
    Returns ``None``.
    """
    # NOTE: duplicate `import streamlit as st` removed — already imported
    # at module level. pandas/scholarly stay function-local as before.
    import pandas as pd
    import scholarly  # NOTE(review): local helper module, not the PyPI
    # `scholarly` package — `get_doi_from_title` / `get_citation_count`
    # are assumed to exist there; confirm against the project.

    st.title("CSV Data Viewer")

    def _text_or_none(parent, path):
        """Return the text of the first element matching *path*, or the
        string 'None' (the CSV's missing-value convention) when the
        element is absent or empty.

        Fixes an AttributeError the old code raised for hits without a
        <title>, <year> or <url> element (only <doi> was guarded).
        """
        elem = parent.find(path)
        if elem is None or elem.text is None:
            return 'None'
        return elem.text

    def _generate_dblp_csv():
        """Convert output/dblp.xml into output/dblp.csv (';'-separated)."""
        tree = ET.parse('output/dblp.xml')
        root = tree.getroot()
        with open('output/dblp.csv', 'w', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=';')
            # CSV header
            csvwriter.writerow(['DBLP', 'Authors', 'Title', 'Year', 'DOI', 'URL'])
            for hit in root.findall('.//hit'):
                # `or ''` guards against empty <author/> elements whose
                # .text is None (the old generator crashed on those).
                authors = ', '.join(
                    author.text or ''
                    for author in hit.findall('.//authors/author')
                )
                title = _text_or_none(hit, './/title')
                year = _text_or_none(hit, './/year')
                doi = _text_or_none(hit, './/doi')
                url = _text_or_none(hit, './/url')
                # renamed from `id`, which shadowed the builtin
                record_id = hit.attrib['id']
                csvwriter.writerow([record_id, authors, title, year, doi, url])

    def _show_doi_coverage(frame):
        """Report how many rows still have the 'None' DOI placeholder."""
        count_none = (frame['DOI'] == 'None').sum()
        total_values = len(frame['DOI'])
        st.write("Number of values in DOI column equal to None / Total values: "
                 + str(count_none) + "/" + str(total_values))

    def load_data():
        # latin1 matches how the crawler originally wrote the file.
        data = pd.read_csv("output/dblp.csv", sep=";", encoding="latin1")
        return data

    # Regenerate BEFORE loading: Streamlit reruns the whole script on a
    # button click, and the old order displayed stale, pre-regeneration
    # data (and crashed on first run when the CSV did not exist yet).
    if st.button("generate dblp csv"):
        _generate_dblp_csv()
        # Was `print(...)`, which only reached the server console;
        # surface the confirmation in the UI instead.
        st.success("Conversione XML in CSV completata.")

    data = load_data()
    _show_doi_coverage(data)

    # Step 1: Display the data retrieved from DBLP
    step_1 = st.sidebar.checkbox(
        "1 - Display the data retrieved from DBLP (Digital Bibliography & Library Project)")
    if step_1:
        st.write("Data from DBLP:")
        st.write(data)

    # Step 2: Iterate over rows and update 'doi' column if necessary
    step_2 = st.sidebar.checkbox(
        "2 - Iterate over rows and update 'doi' column if necessary")
    if step_2:
        # (Removed dead debug artifacts: unused `cit_array`, `count`,
        # and the stray `st.write(count)` that always printed 0.)
        for index, row in data.iterrows():
            # If 'doi' is the 'None' placeholder, try a title lookup.
            if row['DOI'] == 'None':
                doi = scholarly.get_doi_from_title(row['Title'])
                if doi:
                    data.at[index, 'DOI'] = doi
        st.write("Data with DOI")
        _show_doi_coverage(data)
        st.write(data)

    # Step 3: Loop over DOIs and retrieve citation counts
    step_3 = st.sidebar.checkbox("3 - Loop over DOIs and retrieve citation counts")
    if step_3:
        cit_array = []
        for index, row in data.iterrows():
            doi = row['DOI']
            # BUG FIX: the old test was `if doi:` — but missing DOIs are
            # stored as the *string* 'None', which is truthy, so the
            # title-fallback branch below was unreachable.
            if doi and doi != 'None':
                citation_count = scholarly.get_citation_count(doi)
                if citation_count is not None:
                    st.text(f"DOI: {doi}, Citation Count: {citation_count}")
            else:
                # Missing DOI: resolve one from the title first.
                title = row['Title']
                doi_bytitle = scholarly.get_doi_from_title(str(title))
                citation_count = scholarly.get_citation_count(doi_bytitle)
                if citation_count is not None:
                    st.text(f"DOI from Title: {title}, Citation Count: {citation_count}")
            # BUG FIX: always append (None when no count was found) so
            # the list stays row-aligned; the old code skipped rows and
            # the column assignment below raised a length-mismatch
            # ValueError whenever any lookup failed.
            cit_array.append(citation_count)

        # Add the citation count column to the DataFrame
        data['Citation Count'] = cit_array
        st.write(data)
        output_path = "output/crawler_doi_citation.csv"
        data.to_csv(output_path, sep=';', index=False)

    # Step 4: Download Filtered Data as CSV
    if not data.empty and step_3:
        st.download_button(
            label="Download Filtered Data as CSV",
            data=data.to_csv(index=False).encode(),
            file_name="crawler_doi_citation.csv",
            key="crawler_doi_citation",
        )