File size: 5,193 Bytes
0793b2f
 
 
 
 
b6d300f
 
5d62944
 
b6d300f
0793b2f
6f00b08
0793b2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f00b08
0793b2f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import os
import re
import tempfile

import nltk

# NOTE(review): downloading NLTK resources at import time is a deliberate
# side effect for this Streamlit script — the tokenizer ("punkt_tab"),
# stop-word lists, and WordNet data must be present before the
# corpus-dependent imports below are used at runtime.
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

import pymupdf4llm
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer


# Pattern stripping common Markdown syntax: heading/emphasis markers,
# bracketed link text, parenthesized URLs, and inline code spans.
# Compiled once instead of on every call.
_MARKDOWN_SYNTAX_RE = re.compile(r"#|\*|_|\[.*?\]|\(.*?\)|`.*?`")

# Built once at import time: preprocess_markdown() is called once per
# paragraph of the document, and reloading the stop-word list and
# re-instantiating the lemmatizer on each call is pure overhead.
_STOP_WORDS = set(stopwords.words("french"))  # or "english" depending on the corpus language
_LEMMATIZER = WordNetLemmatizer()


def preprocess_markdown(markdown_text):
    """Strip Markdown syntax and return a cleaned, lemmatized token string.

    Args:
        markdown_text: Raw Markdown text (typically one paragraph).

    Returns:
        A single space-joined string of lowercased, lemmatized, purely
        alphabetic tokens with French stop words removed. May be empty
        if no token survives the filtering.
    """
    # Remove Markdown syntax before tokenizing.
    text = _MARKDOWN_SYNTAX_RE.sub("", markdown_text)

    # Tokenize, then keep only alphabetic non-stop-word tokens, lemmatized.
    tokens = nltk.word_tokenize(text.lower())
    processed_tokens = [
        _LEMMATIZER.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in _STOP_WORDS
    ]

    return " ".join(processed_tokens)


def main():
    """Streamlit entry point: upload a PDF, convert it to Markdown, show topics.

    Returns:
        The generated Markdown text, or None when no file was uploaded or
        the conversion failed.
    """
    st.title("Topic Miner")

    # Add description
    st.write("Upload a PDF file to identify the main topics in the document.")

    # File uploader widget with drag and drop capability
    uploaded_file = st.file_uploader("Drag and drop a PDF file", type=["pdf"])

    md_text = None

    if uploaded_file is not None:
        st.success("File successfully uploaded!")

        # pymupdf4llm needs a filesystem path, so persist the upload to a
        # temporary file (removed in the finally block below).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(uploaded_file.getvalue())
            pdf_path = tmp_file.name

        try:
            try:
                with st.spinner("Converting PDF to markdown..."):
                    md_text = pymupdf4llm.to_markdown(pdf_path)
                    # Scrub any byte sequences that are not valid UTF-8.
                    md_text = md_text.encode("utf-8", errors="replace").decode("utf-8")
                    st.success("Conversion complete!")

                _display_markdown_preview(md_text)
                _offer_download(uploaded_file, md_text)

            except Exception as e:
                st.error(f"Error processing PDF: {e}")

            # Only attempt topic extraction when conversion produced text.
            # Previously a failed conversion left md_text as None, and the
            # extraction step raised a confusing TypeError on re.split(None).
            if md_text is not None:
                try:
                    topics = _extract_topics(md_text)

                    # Display the extracted topics
                    st.subheader("Main Topics:")
                    for topic in topics:
                        st.write(f"- {topic}")

                except Exception as e:
                    st.error(f"Error extracting topics: {e}")

        finally:
            # Always remove the temporary file, whatever failed above.
            os.unlink(pdf_path)

    return md_text


def _display_markdown_preview(md_text):
    """Render at most the first 20 lines of the generated Markdown."""
    st.subheader("Generated Markdown Content (first 20 lines):")
    md_lines = md_text.split("\n")
    if len(md_lines) > 20:
        st.markdown("\n".join(md_lines[:20]))
        st.info(f"Showing only first 20 lines of {len(md_lines)} total lines.")
    else:
        st.markdown(md_text)


def _offer_download(uploaded_file, md_text):
    """Cache the Markdown in session state and render a download button."""
    # Derive the download filename from the original upload.
    md_filename = os.path.splitext(uploaded_file.name)[0] + ".md"

    # Keyed by name+size in session state to avoid reprocessing on reruns.
    if "processed_files" not in st.session_state:
        st.session_state.processed_files = {}
    file_key = f"{uploaded_file.name}_{uploaded_file.size}"
    st.session_state.processed_files[file_key] = md_text

    st.download_button(
        label="Download full markdown file",
        data=md_text,
        file_name=md_filename,
        mime="text/markdown",
        key=f"download_{file_key}",  # Unique key prevents widget recreation
    )


def _extract_topics(md_text, n_topics=5, n_top_words=2):
    """Run LDA over the document's paragraphs.

    Args:
        md_text: Full Markdown text of the document.
        n_topics: Number of LDA components to fit.
        n_top_words: Number of top words to show per topic.

    Returns:
        A list of human-readable topic summary strings.

    Raises:
        ValueError: From CountVectorizer when no usable tokens remain
            (e.g. empty vocabulary); surfaced by the caller.
    """
    with st.spinner(
        "Extracting main topics using Latent Dirichlet Allocation..."
    ):
        # Split the text into paragraphs/sections to build a small corpus.
        paragraphs = re.split(r"\n\n+", md_text)
        processed_paragraphs = [
            preprocess_markdown(p) for p in paragraphs if p.strip()
        ]

        # Bag-of-words vectorization capped at 1000 terms.
        count_vectorizer = CountVectorizer(max_features=1000)
        count_data = count_vectorizer.fit_transform(processed_paragraphs)

        # Fit LDA; fixed random_state keeps topics reproducible across reruns.
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
        lda.fit(count_data)

        # Extract the top words for each fitted topic.
        feature_names = count_vectorizer.get_feature_names_out()
        topics = []
        for topic_idx, topic in enumerate(lda.components_):
            top_words_idx = topic.argsort()[: -n_top_words - 1 : -1]
            top_words = [feature_names[i] for i in top_words_idx]
            topics.append(f"Topic {topic_idx + 1}: {', '.join(top_words)}")

        st.success("Topics extracted!")

    return topics


if __name__ == "__main__":
    # Streamlit re-executes this script top-to-bottom on every interaction;
    # main() returns the generated Markdown (None until a file is uploaded).
    markdown_variable = main()