Spaces:

berangerthomas
/

topicminer

Paused

App Files Files Community

topicminer / app.py

berangerthomas

Ajout nltk

5d62944 11 months ago

raw

history blame contribute delete

5.19 kB

	import os
	import re
	import tempfile

	import nltk

	# Fetch the NLTK data used below: the tokenizer models, the stop-word
	# lists and the WordNet database for the lemmatizer.
	for _nltk_resource in ("punkt_tab", "stopwords", "wordnet"):
	    nltk.download(_nltk_resource)

	import pymupdf4llm
	import streamlit as st
	from nltk.corpus import stopwords
	from nltk.stem import WordNetLemmatizer
	from sklearn.decomposition import LatentDirichletAllocation
	from sklearn.feature_extraction.text import CountVectorizer


	# Markdown markup removed before tokenising:
	#   - single marker characters: '#', '*', '_' and the escape backslash
	#   - bracketed spans      [...]
	#   - parenthesised spans  (...)
	#   - inline code          `...`
	# The spans are non-greedy so that two links on one line are removed
	# separately instead of swallowing the text between them. The pipes are
	# plain alternation; escaping them would make the pattern look for literal
	# '|' characters and match practically nothing.
	_MARKDOWN_SYNTAX_RE = re.compile(r"[#*_\\]|\[.*?\]|\(.*?\)|`.*?`")


	def preprocess_markdown(markdown_text, language="french"):
	    """Normalise a Markdown fragment into a space-separated bag of words.

	    The text is stripped of Markdown markup, lower-cased, tokenised with
	    NLTK, filtered down to purely alphabetic tokens that are not stop
	    words, and each remaining token is passed through the WordNet
	    lemmatizer.

	    Args:
	        markdown_text: Markdown source of one paragraph or section.
	        language: Name of the NLTK stop-word list to filter with
	            (for example ``"french"`` or ``"english"``). Defaults to
	            ``"french"``, the value that was previously hard-coded.

	    Returns:
	        The surviving lemmatized tokens joined by single spaces; an empty
	        string when nothing survives the filtering.
	    """
	    text = _MARKDOWN_SYNTAX_RE.sub("", markdown_text)

	    # Tokenisation and clean-up.
	    tokens = nltk.word_tokenize(text.lower())
	    stop_words = set(stopwords.words(language))
	    # NOTE(review): WordNetLemmatizer is backed by the WordNet corpus, which
	    # is presumably English-only, so French tokens likely pass through
	    # unchanged -- confirm whether a French lemmatizer is wanted here.
	    lemmatizer = WordNetLemmatizer()

	    processed_tokens = [
	        lemmatizer.lemmatize(token)
	        for token in tokens
	        if token.isalpha() and token not in stop_words
	    ]

	    return " ".join(processed_tokens)


	def _pdf_bytes_to_markdown(pdf_bytes):
	    """Convert the raw bytes of a PDF into Markdown text.

	    The bytes are written to a temporary ``.pdf`` file because the converter
	    is handed a filesystem path. The file is removed as soon as conversion
	    finishes, whether it succeeded or raised.
	    """
	    # delete=False keeps the file on disk after the handle is closed so the
	    # converter can open it by path; it is unlinked explicitly below.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
	        tmp_file.write(pdf_bytes)
	        pdf_path = tmp_file.name

	    try:
	        md_text = pymupdf4llm.to_markdown(pdf_path)
	    finally:
	        os.unlink(pdf_path)

	    # Round-trip through UTF-8 with errors="replace" so characters that
	    # cannot be encoded do not break the display or the download later on.
	    return md_text.encode("utf-8", errors="replace").decode("utf-8")


	def _show_markdown_preview(md_text):
	    """Render the first 20 lines of the Markdown text under a subheader."""
	    st.subheader("Generated Markdown Content (first 20 lines):")
	    md_lines = md_text.split("\n")
	    if len(md_lines) > 20:
	        st.markdown("\n".join(md_lines[:20]))
	        st.info(f"Showing only first 20 lines of {len(md_lines)} total lines.")
	    else:
	        st.markdown(md_text)


	def _extract_topics(md_text, n_topics=5, n_top_words=2):
	    """Fit an LDA model on the paragraphs of *md_text* and label its topics.

	    Returns a list of strings of the form ``"Topic 1: word_a, word_b"``,
	    one per topic, each listing the ``n_top_words`` highest-weighted words.
	    """
	    # Each blank-line-separated paragraph is one document of the corpus.
	    paragraphs = re.split(r"\n\n+", md_text)
	    processed_paragraphs = [
	        preprocess_markdown(p) for p in paragraphs if p.strip()
	    ]

	    # Bag-of-words counts, limited to the 1000 most frequent terms.
	    count_vectorizer = CountVectorizer(max_features=1000)
	    count_data = count_vectorizer.fit_transform(processed_paragraphs)

	    # random_state is fixed so the same document yields the same topics.
	    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
	    lda.fit(count_data)

	    feature_names = count_vectorizer.get_feature_names_out()
	    topics = []
	    for topic_idx, topic in enumerate(lda.components_):
	        # argsort is ascending; the reversed slice takes the indices of the
	        # n_top_words largest weights, largest first.
	        top_words_idx = topic.argsort()[: -n_top_words - 1 : -1]
	        top_words = [feature_names[i] for i in top_words_idx]
	        topics.append(f"Topic {topic_idx + 1}: {', '.join(top_words)}")
	    return topics


	def main():
	    """Run the Topic Miner page.

	    Lets the user upload a PDF, converts it to Markdown, shows a preview
	    with a download button, then extracts and lists the main topics.

	    Returns:
	        The Markdown text of the uploaded PDF, or ``None`` when no file has
	        been uploaded or the conversion failed.
	    """
	    st.title("Topic Miner")

	    # Add description
	    st.write("Upload a PDF file to identify the main topics in the document.")

	    # File uploader widget with drag and drop capability
	    uploaded_file = st.file_uploader("Drag and drop a PDF file", type=["pdf"])
	    if uploaded_file is None:
	        return None

	    st.success("File successfully uploaded!")

	    original_filename = uploaded_file.name
	    md_filename = os.path.splitext(original_filename)[0] + ".md"

	    # Converted text is kept in session state so that script reruns (for
	    # example after clicking the download button) do not reconvert the PDF.
	    # The key is name + size, so a different file with the same name and
	    # size would reuse the cached text.
	    if "processed_files" not in st.session_state:
	        st.session_state.processed_files = {}
	    file_key = f"{original_filename}_{uploaded_file.size}"
	    md_text = st.session_state.processed_files.get(file_key)

	    try:
	        if md_text is None:
	            with st.spinner("Converting PDF to markdown..."):
	                md_text = _pdf_bytes_to_markdown(uploaded_file.getvalue())
	            st.session_state.processed_files[file_key] = md_text
	        st.success("Conversion complete!")

	        _show_markdown_preview(md_text)

	        st.download_button(
	            label="Download full markdown file",
	            data=md_text,
	            file_name=md_filename,
	            mime="text/markdown",
	            key=f"download_{file_key}",  # Unique key prevents widget recreation
	        )
	    except Exception as e:
	        st.error(f"Error processing PDF: {e}")

	    # Without Markdown text there is nothing to mine; the conversion error
	    # has already been reported above.
	    if md_text is None:
	        return None

	    try:
	        with st.spinner(
	            "Extracting main topics using Latent Dirichlet Allocation..."
	        ):
	            topics = _extract_topics(md_text)

	        st.success("Topics extracted!")

	        # Display the extracted topics
	        st.subheader("Main Topics:")
	        for topic in topics:
	            st.write(f"- {topic}")
	    except Exception as e:
	        st.error(f"Error extracting topics: {e}")

	    return md_text


	if __name__ == "__main__":
	    # Build the page and keep the resulting Markdown text (None when
	    # nothing was produced) in a module-level name.
	    markdown_variable = main()