# Spaces: berangerthomas / topicminer (Paused) - File size: 5,193 Bytes

import os
import re
import tempfile

import nltk

# Fetch the NLTK data packages this script relies on (tokenizer tables,
# stop-word lists and WordNet) before the NLTK corpus imports below.
for _nltk_package in ("punkt_tab", "stopwords", "wordnet"):
    nltk.download(_nltk_package)

import pymupdf4llm
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer


# Markdown text preprocessing

# Image references (``![alt](path)``) are dropped entirely.
_MD_IMAGE_RE = re.compile(r"!\[[^\]]*\]\([^)\n]*\)")
# Inline links (``[text](url)``): the visible text is kept, the URL is dropped.
_MD_LINK_RE = re.compile(r"\[([^\]]*)\]\([^)\n]*\)")
# Inline code spans plus the heading / emphasis marker characters.
_MD_NOISE_RE = re.compile(r"`.*?`|[#*_]")

# Stop-word sets keyed by language name, so the corpus file is read once per
# language instead of once per paragraph.
_STOP_WORDS_CACHE = {}


def _stop_words_for(language):
    """Return the NLTK stop-word set for *language*, loading it at most once."""
    try:
        return _STOP_WORDS_CACHE[language]
    except KeyError:
        words = frozenset(stopwords.words(language))
        _STOP_WORDS_CACHE[language] = words
        return words


def preprocess_markdown(markdown_text, language="french"):
    """Turn a Markdown fragment into a space-separated string of clean tokens.

    Markdown syntax is stripped (image references, link URLs, inline code
    spans and the ``#``, ``*``, ``_`` markers) while the visible text of links
    and ordinary parenthetical/bracketed prose is preserved. The text is then
    lower-cased, tokenized, filtered to purely alphabetic tokens that are not
    stop words, and lemmatized.

    Args:
        markdown_text: The Markdown text to clean.
        language: Name of the NLTK stop-word list to filter with. Defaults to
            ``"french"``, the value this function previously hard-coded.

    Returns:
        The surviving lemmatized tokens joined by single spaces (an empty
        string when nothing survives).
    """
    # Strip syntax only: earlier versions removed every "[...]" and "(...)"
    # span, which also discarded link text and parenthetical prose.
    text = _MD_IMAGE_RE.sub("", markdown_text)
    text = _MD_LINK_RE.sub(r"\1", text)
    text = _MD_NOISE_RE.sub("", text)

    # Tokenization and cleaning
    tokens = nltk.word_tokenize(text.lower())
    stop_words = _stop_words_for(language)
    # NOTE(review): the lemmatizer is WordNet-based, so it presumably only
    # normalizes English word forms - confirm this is acceptable for French.
    lemmatizer = WordNetLemmatizer()

    processed_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

    return " ".join(processed_tokens)


def _pdf_bytes_to_markdown(pdf_bytes):
    """Convert raw PDF bytes to Markdown text via a short-lived temporary file."""
    # pymupdf4llm is handed a path, so the bytes are written to a named file
    # that is closed first and removed manually once conversion is over.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        with tmp_file:
            tmp_file.write(pdf_bytes)
        md_text = pymupdf4llm.to_markdown(tmp_file.name)
    finally:
        # Always remove the temporary file, whether or not conversion worked.
        os.unlink(tmp_file.name)

    # Round-trip through UTF-8 so characters that cannot be encoded are
    # replaced instead of breaking later display or download.
    return md_text.encode("utf-8", errors="replace").decode("utf-8")


def _extract_topics(md_text, n_topics=5, n_top_words=2):
    """Fit LDA on the paragraphs of *md_text* and return one label per topic."""
    # Split the text into paragraphs (blank-line separated) to form a corpus.
    paragraphs = re.split(r"\n\n+", md_text)
    processed_paragraphs = [preprocess_markdown(p) for p in paragraphs if p.strip()]

    # Vectorization
    count_vectorizer = CountVectorizer(max_features=1000)
    count_data = count_vectorizer.fit_transform(processed_paragraphs)

    # LDA fit; fixed random_state keeps the topics reproducible between runs.
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda.fit(count_data)

    feature_names = count_vectorizer.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(lda.components_):
        # Indices of the n_top_words highest-weighted terms, best first.
        top_words_idx = topic.argsort()[: -n_top_words - 1 : -1]
        top_words = [feature_names[i] for i in top_words_idx]
        topics.append(f"Topic {topic_idx + 1}: {', '.join(top_words)}")
    return topics


def main():
    """Run the Topic Miner page: upload a PDF, preview it as Markdown, list topics.

    The converted Markdown is cached in ``st.session_state.processed_files``
    under a ``"<filename>_<size>"`` key, so script reruns (for example after
    clicking the download button) reuse it instead of converting the PDF again.

    Returns:
        The Markdown text of the uploaded PDF, or ``None`` when no file has
        been uploaded or the conversion failed.
    """
    st.title("Topic Miner")

    # Add description
    st.write("Upload a PDF file to identify the main topics in the document.")

    # File uploader widget with drag and drop capability
    uploaded_file = st.file_uploader("Drag and drop a PDF file", type=["pdf"])

    if uploaded_file is None:
        return None

    # Display success message
    st.success("File successfully uploaded!")

    # Derive the download name from the original filename.
    original_filename = uploaded_file.name
    md_filename = os.path.splitext(original_filename)[0] + ".md"

    # Store file info in session state to avoid reprocessing.
    # NOTE: the key is name + size only, so two different files sharing both
    # would be treated as the same document.
    if "processed_files" not in st.session_state:
        st.session_state.processed_files = {}
    file_key = f"{original_filename}_{uploaded_file.size}"
    md_text = st.session_state.processed_files.get(file_key)

    try:
        if md_text is None:
            # Show processing indicator while the PDF is converted.
            with st.spinner("Converting PDF to markdown..."):
                md_text = _pdf_bytes_to_markdown(uploaded_file.getvalue())
                st.success("Conversion complete!")
            st.session_state.processed_files[file_key] = md_text

        # Display the markdown content, limited to the first 20 lines.
        st.subheader("Generated Markdown Content (first 20 lines):")
        md_lines = md_text.split("\n")
        if len(md_lines) > 20:
            md_text_display = "\n".join(md_lines[:20])
            st.markdown(md_text_display)
            st.info(f"Showing only first 20 lines of {len(md_lines)} total lines.")
        else:
            st.markdown(md_text)

        # Download button for the complete markdown file.
        st.download_button(
            label="Download full markdown file",
            data=md_text,
            file_name=md_filename,
            mime="text/markdown",
            key=f"download_{file_key}",  # Unique key prevents widget recreation
        )

    except Exception as e:
        st.error(f"Error processing PDF: {e}")

    # Without Markdown text there is nothing to mine; the conversion error has
    # already been reported, so do not stack a second, misleading error on it.
    if md_text is None:
        return None

    try:
        # Extracting main topics using LDA from scikit-learn
        with st.spinner("Extracting main topics using Latent Dirichlet Allocation..."):
            topics = _extract_topics(md_text)
            st.success("Topics extracted!")

        # Display the extracted topics
        st.subheader("Main Topics:")
        for topic in topics:
            st.write(f"- {topic}")

    except Exception as e:
        st.error(f"Error extracting topics: {e}")

    return md_text


# Script entry point: run the app and keep the value returned by main() (the
# converted Markdown text, or None) in a module-level name.
if __name__ == "__main__":
    markdown_variable = main()