"""Topic Miner: a Streamlit app that converts an uploaded PDF to Markdown
and extracts its main topics with Latent Dirichlet Allocation.

Pipeline: PDF -> Markdown (pymupdf4llm) -> preprocessing (NLTK) -> LDA (scikit-learn).
"""

import os
import re
import tempfile

import nltk
import pymupdf4llm
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Download required NLTK resources at startup (no-ops when already present).
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")


def preprocess_markdown(markdown_text):
    """Strip Markdown syntax and return a cleaned, lemmatized token string.

    Removes common Markdown markers (headers, emphasis, links, inline code),
    lowercases and tokenizes the text, drops non-alphabetic tokens and French
    stop words, and lemmatizes what remains.

    Args:
        markdown_text: Raw Markdown text.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Remove Markdown syntax.
    text = re.sub(r"#|\*|_|\[.*?\]|\(.*?\)|`.*?`", "", markdown_text)
    # Tokenize and clean.
    tokens = nltk.word_tokenize(text.lower())
    stop_words = set(stopwords.words("french"))  # or 'english' depending on your language
    lemmatizer = WordNetLemmatizer()
    processed_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]
    return " ".join(processed_tokens)


def main():
    """Run the Streamlit UI: upload a PDF, show its Markdown, list LDA topics.

    Returns:
        The generated Markdown text, or None if no file was processed.
    """
    st.title("Topic Miner")

    # Add description
    st.write("Upload a PDF file to identify the main topics in the document.")

    # File uploader widget with drag and drop capability
    uploaded_file = st.file_uploader("Drag and drop a PDF file", type=["pdf"])

    md_text = None
    if uploaded_file is not None:
        # Display success message
        st.success("File successfully uploaded!")

        # Save the uploaded file to a temporary location so pymupdf4llm can
        # open it by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(uploaded_file.getvalue())
            pdf_path = tmp_file.name

        try:
            # Process the PDF using pymupdf4llm, with a processing indicator.
            with st.spinner("Converting PDF to markdown..."):
                md_text = pymupdf4llm.to_markdown(pdf_path)
                # Replace any invalid byte sequences so display cannot fail.
                md_text = md_text.encode("utf-8", errors="replace").decode("utf-8")
            st.success("Conversion complete!")

            # Display the markdown content, limited to the first 20 lines.
            st.subheader("Generated Markdown Content (first 20 lines):")
            md_lines = md_text.split("\n")
            if len(md_lines) > 20:
                md_text_display = "\n".join(md_lines[:20])
                st.markdown(md_text_display)
                st.info(f"Showing only first 20 lines of {len(md_lines)} total lines.")
            else:
                st.markdown(md_text)

            # Offer the complete markdown for download, named after the
            # original PDF with the extension replaced.
            original_filename = uploaded_file.name
            md_filename = os.path.splitext(original_filename)[0] + ".md"

            # Store file info in session state to avoid reprocessing on rerun.
            if "processed_files" not in st.session_state:
                st.session_state.processed_files = {}
            file_key = f"{original_filename}_{uploaded_file.size}"
            st.session_state.processed_files[file_key] = md_text

            st.download_button(
                label="Download full markdown file",
                data=md_text,
                file_name=md_filename,
                mime="text/markdown",
                key=f"download_{file_key}",  # Unique key prevents widget recreation
            )
        except Exception as e:
            st.error(f"Error processing PDF: {e}")

        try:
            # Only attempt topic extraction when conversion produced text;
            # previously a failed conversion cascaded into a confusing
            # "Error extracting topics" TypeError.
            if md_text is not None:
                with st.spinner(
                    "Extracting main topics using Latent Dirichlet Allocation..."
                ):
                    # Split the text into paragraphs/sections to build a corpus.
                    paragraphs = re.split(r"\n\n+", md_text)
                    processed_paragraphs = [
                        preprocess_markdown(p) for p in paragraphs if p.strip()
                    ]
                    # Guard against an empty corpus (e.g. image-only PDFs),
                    # which would make CountVectorizer raise on an empty
                    # vocabulary.
                    if not any(processed_paragraphs):
                        st.warning("No usable text found for topic extraction.")
                        return md_text

                    # Vectorize the corpus into term counts.
                    count_vectorizer = CountVectorizer(max_features=1000)
                    count_data = count_vectorizer.fit_transform(processed_paragraphs)

                    # Fit LDA; n_components must not exceed the number of
                    # documents, otherwise scikit-learn raises.
                    n_topics = min(5, len(processed_paragraphs))
                    lda = LatentDirichletAllocation(
                        n_components=n_topics, random_state=0
                    )
                    lda.fit(count_data)

                    # Extract the top words for each topic.
                    feature_names = count_vectorizer.get_feature_names_out()
                    n_top_words = 2
                    topics = []
                    for topic_idx, topic in enumerate(lda.components_):
                        top_words_idx = topic.argsort()[: -n_top_words - 1 : -1]
                        top_words = [feature_names[i] for i in top_words_idx]
                        topics.append(
                            f"Topic {topic_idx + 1}: {', '.join(top_words)}"
                        )
                st.success("Topics extracted!")

                # Display the extracted topics
                st.subheader("Main Topics:")
                for topic in topics:
                    st.write(f"- {topic}")
        except Exception as e:
            st.error(f"Error extracting topics: {e}")
        finally:
            # Always remove the temporary file, even when extraction failed
            # or was skipped.
            os.unlink(pdf_path)

    return md_text


if __name__ == "__main__":
    markdown_variable = main()