Spaces:

berangerthomas
/

topicminer

Paused

App Files Files Community

berangerthomas committed on Feb 28, 2025

Commit

0793b2f

1 Parent(s): 43093db

Essai LDA

Browse files

Files changed (1) hide show

app.py +141 -2

app.py CHANGED Viewed

@@ -1,4 +1,143 @@
 import streamlit as st
-x = st.slider("Sélectionner une valeur")
-st.write(x, "Son carré est :", x * x)

+import os
+import re
+import tempfile
+import nltk
+import pymupdf4llm
 import streamlit as st
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from sklearn.decomposition import LatentDirichletAllocation
+from sklearn.feature_extraction.text import CountVectorizer
+# Prétraitement du texte Markdown
+def preprocess_markdown(markdown_text):
+    # Supprimer la syntaxe Markdown
+    text = re.sub(r"#|\*|_|\[.*?\]|\(.*?\)|`.*?`", "", markdown_text)
+    # Tokenisation et nettoyage
+    tokens = nltk.word_tokenize(text.lower())
+    stop_words = set(stopwords.words("french"))  # ou 'english' selon votre langue
+    lemmatizer = WordNetLemmatizer()
+    processed_tokens = [
+        lemmatizer.lemmatize(token)
+        for token in tokens
+        if token.isalpha() and token not in stop_words
+    ]
+    return " ".join(processed_tokens)
+def main():
+    st.title("Topic Miner")
+    # Add description
+    st.write("Upload a PDF file to identify the main topics in the document.")
+    # File uploader widget with drag and drop capability
+    uploaded_file = st.file_uploader("Drag and drop a PDF file", type=["pdf"])
+    md_text = None
+    if uploaded_file is not None:
+        # Display success message
+        st.success("File successfully uploaded!")
+        # Save the uploaded file to a temporary location
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+            tmp_file.write(uploaded_file.getvalue())
+            pdf_path = tmp_file.name
+        try:
+            # Process the PDF using pymupdf4llm
+            # Show processing indicator
+            with st.spinner("Converting PDF to markdown..."):
+                md_text = pymupdf4llm.to_markdown(pdf_path)
+                md_text = md_text.encode("utf-8", errors="replace").decode("utf-8")
+                st.success("Conversion complete!")
+            # Display the markdown content
+            st.subheader("Generated Markdown Content (first 20 lines):")
+            # Limit display to first 20 lines
+            md_lines = md_text.split("\n")
+            if len(md_lines) > 20:
+                md_text_display = "\n".join(md_lines[:20])
+                st.markdown(md_text_display)
+                st.info(f"Showing only first 20 lines of {len(md_lines)} total lines.")
+            else:
+                st.markdown(md_text)
+            # Add download button for the complete markdown file
+            # Get the original filename and replace extension
+            original_filename = uploaded_file.name
+            md_filename = os.path.splitext(original_filename)[0] + ".md"
+            # Store file info in session state to avoid reprocessing
+            if "processed_files" not in st.session_state:
+                st.session_state.processed_files = {}
+            file_key = f"{original_filename}_{uploaded_file.size}"
+            st.session_state.processed_files[file_key] = md_text
+            st.download_button(
+                label="Download full markdown file",
+                # data=md_text.encode("utf-8"),
+                data=md_text,
+                file_name=md_filename,
+                mime="text/markdown",
+                key=f"download_{file_key}",  # Unique key prevents widget recreation
+            )
+        except Exception as e:
+            st.error(f"Error processing PDF: {e}")
+        try:
+            # Extracting main topics using LDA from scikit-learn
+            with st.spinner(
+                "Extracting main topics using Latent Dirichlet Allocation..."
+            ):
+                # Diviser le texte en paragraphes ou sections pour créer un corpus
+                paragraphs = re.split(r"\n\n+", md_text)
+                processed_paragraphs = [
+                    preprocess_markdown(p) for p in paragraphs if p.strip()
+                ]
+                # Vectorisation
+                count_vectorizer = CountVectorizer(max_features=1000)
+                count_data = count_vectorizer.fit_transform(processed_paragraphs)
+                # Application de LDA
+                lda = LatentDirichletAllocation(n_components=5, random_state=0)
+                lda.fit(count_data)
+                # Extract top words for each topic
+                feature_names = count_vectorizer.get_feature_names_out()
+                n_top_words = 2
+                topics = []
+                for topic_idx, topic in enumerate(lda.components_):
+                    top_words_idx = topic.argsort()[: -n_top_words - 1 : -1]
+                    top_words = [feature_names[i] for i in top_words_idx]
+                    topics.append(f"Topic {topic_idx + 1}: {', '.join(top_words)}")
+                st.success("Topics extracted!")
+            # Display the extracted topics
+            st.subheader("Main Topics:")
+            for topic in topics:
+                st.write(f"- {topic}")
+        except Exception as e:
+            st.error(f"Error extracting topics: {e}")
+        finally:
+            # Remove the temporary file
+            os.unlink(pdf_path)
+    return md_text
+# Script entry point: run the page and keep the value returned by main()
+# (the Markdown text, or None when none was produced) in a module-level name.
+if __name__ == "__main__":
+    markdown_variable = main()