berangerthomas committed on
Commit
0793b2f
·
1 Parent(s): 43093db
Files changed (1) hide show
  1. app.py +141 -2
app.py CHANGED
@@ -1,4 +1,143 @@
 
 
 
 
 
 
1
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- x = st.slider("Sélectionner une valeur")
4
- st.write(x, "Son carré est :", x * x)
 
1
+ import os
2
+ import re
3
+ import tempfile
4
+
5
+ import nltk
6
+ import pymupdf4llm
7
  import streamlit as st
8
+ from nltk.corpus import stopwords
9
+ from nltk.stem import WordNetLemmatizer
10
+ from sklearn.decomposition import LatentDirichletAllocation
11
+ from sklearn.feature_extraction.text import CountVectorizer
12
+
13
+
14
# Preprocessing of Markdown text
def preprocess_markdown(markdown_text):
    """Strip Markdown syntax and return a cleaned, lemmatized token string.

    Parameters
    ----------
    markdown_text : str
        Raw Markdown text (e.g. produced by ``pymupdf4llm.to_markdown``).

    Returns
    -------
    str
        Space-joined lowercase alphabetic tokens with French stopwords
        removed and each token lemmatized.
    """
    # Make sure the NLTK resources used below are present; download lazily
    # on first use instead of crashing with LookupError at tokenize time.
    for resource, locator in (
        ("punkt", "tokenizers/punkt"),
        ("stopwords", "corpora/stopwords"),
        ("wordnet", "corpora/wordnet"),
    ):
        try:
            nltk.data.find(locator)
        except LookupError:
            nltk.download(resource, quiet=True)

    # Remove Markdown syntax (headers, emphasis, links, inline code).
    text = re.sub(r"#|\*|_|\[.*?\]|\(.*?\)|`.*?`", "", markdown_text)

    # Tokenization and cleaning
    tokens = nltk.word_tokenize(text.lower())
    stop_words = set(stopwords.words("french"))  # or 'english' depending on your language
    # NOTE(review): WordNetLemmatizer is English-oriented; on French tokens it
    # is mostly a no-op — confirm the intended document language.
    lemmatizer = WordNetLemmatizer()

    processed_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

    return " ".join(processed_tokens)
31
+
32
+
33
def _save_upload_to_tempfile(uploaded_file):
    """Write the uploaded PDF bytes to a temporary file and return its path."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        return tmp_file.name


def _display_markdown_preview(md_text, max_lines=20):
    """Render at most *max_lines* lines of *md_text* in the Streamlit app."""
    md_lines = md_text.split("\n")
    if len(md_lines) > max_lines:
        st.markdown("\n".join(md_lines[:max_lines]))
        st.info(f"Showing only first {max_lines} lines of {len(md_lines)} total lines.")
    else:
        st.markdown(md_text)


def _extract_topics(md_text, n_topics=5, n_top_words=2):
    """Run LDA over paragraph chunks of *md_text* and return topic labels.

    Parameters
    ----------
    md_text : str
        Markdown text to analyze.
    n_topics : int
        Number of LDA components to fit.
    n_top_words : int
        Number of top words displayed per topic.

    Returns
    -------
    list[str]
        One ``"Topic i: word, word"`` label per fitted component.
    """
    # Split the text into paragraphs/sections to build a small corpus.
    paragraphs = re.split(r"\n\n+", md_text)
    processed_paragraphs = [preprocess_markdown(p) for p in paragraphs if p.strip()]

    # Vectorization
    count_vectorizer = CountVectorizer(max_features=1000)
    count_data = count_vectorizer.fit_transform(processed_paragraphs)

    # Fit LDA (fixed seed for reproducible topics).
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda.fit(count_data)

    # Extract the top words for each topic.
    feature_names = count_vectorizer.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(lda.components_):
        top_words_idx = topic.argsort()[: -n_top_words - 1 : -1]
        top_words = [feature_names[i] for i in top_words_idx]
        topics.append(f"Topic {topic_idx + 1}: {', '.join(top_words)}")
    return topics


def main():
    """Streamlit entry point: upload a PDF, convert it to Markdown, show a
    preview and a download button, then display LDA-extracted main topics.

    Returns
    -------
    str | None
        The converted Markdown text, or None when no file was uploaded or
        the conversion failed.
    """
    st.title("Topic Miner")

    # Add description
    st.write("Upload a PDF file to identify the main topics in the document.")

    # File uploader widget with drag and drop capability
    uploaded_file = st.file_uploader("Drag and drop a PDF file", type=["pdf"])

    md_text = None

    if uploaded_file is not None:
        st.success("File successfully uploaded!")

        # Save the uploaded file to a temporary location.
        pdf_path = _save_upload_to_tempfile(uploaded_file)

        try:
            # Convert the PDF with pymupdf4llm, showing a progress indicator.
            with st.spinner("Converting PDF to markdown..."):
                md_text = pymupdf4llm.to_markdown(pdf_path)
                # Normalize undecodable byte sequences so rendering can't fail.
                md_text = md_text.encode("utf-8", errors="replace").decode("utf-8")
                st.success("Conversion complete!")

            st.subheader("Generated Markdown Content (first 20 lines):")
            _display_markdown_preview(md_text)

            # Download button for the complete markdown file; keep the original
            # file name, swapping the extension.
            original_filename = uploaded_file.name
            md_filename = os.path.splitext(original_filename)[0] + ".md"

            # Store file info in session state to avoid reprocessing.
            if "processed_files" not in st.session_state:
                st.session_state.processed_files = {}
            file_key = f"{original_filename}_{uploaded_file.size}"
            st.session_state.processed_files[file_key] = md_text

            st.download_button(
                label="Download full markdown file",
                data=md_text,
                file_name=md_filename,
                mime="text/markdown",
                key=f"download_{file_key}",  # Unique key prevents widget recreation
            )

        except Exception as e:
            st.error(f"Error processing PDF: {e}")

        try:
            # Only attempt topic extraction when conversion produced text; the
            # previous version passed md_text=None into re.split on failure.
            if md_text:
                with st.spinner(
                    "Extracting main topics using Latent Dirichlet Allocation..."
                ):
                    topics = _extract_topics(md_text)
                    st.success("Topics extracted!")

                    # Display the extracted topics
                    st.subheader("Main Topics:")
                    for topic in topics:
                        st.write(f"- {topic}")

        except Exception as e:
            st.error(f"Error extracting topics: {e}")

        finally:
            # Remove the temporary file
            os.unlink(pdf_path)

    return md_text
140
+
141
 
142
+ if __name__ == "__main__":
143
+ markdown_variable = main()