Spaces:
Running
Running
Ilyas KHIAT
committed on
Commit
·
bc557f4
1
Parent(s):
62a5ad1
more details content
Browse files
- app.py +26 -23
- utils/audit/audit_doc.py +3 -7
- utils/audit/transcript_audio.py +0 -3
app.py
CHANGED
|
@@ -116,25 +116,28 @@ def main():
|
|
| 116 |
# Streamlit app
|
| 117 |
st.title("AUDIT DES DOCUMENTS")
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
if "audit" not in st.session_state:
|
| 120 |
st.session_state.audit = {}
|
| 121 |
if "name_file" not in st.session_state:
|
| 122 |
st.session_state.name_file = ""
|
| 123 |
|
| 124 |
# File uploader
|
| 125 |
-
uploaded_file = st.file_uploader("Télécharger un ou plusieurs
|
| 126 |
|
| 127 |
if uploaded_file is not None:
|
| 128 |
type = classify_file(uploaded_file)
|
| 129 |
|
| 130 |
st.write(f"Type de fichier: {type}")
|
| 131 |
|
| 132 |
-
st.write("###
|
| 133 |
|
| 134 |
if type == "pdf":
|
| 135 |
display_audit_pdf(uploaded_file)
|
| 136 |
|
| 137 |
-
|
| 138 |
elif type == "audio":
|
| 139 |
if st.session_state.name_file != uploaded_file.name:
|
| 140 |
st.session_state.name_file = uploaded_file.name
|
|
@@ -179,28 +182,28 @@ def main():
|
|
| 179 |
|
| 180 |
st.code(well_formatted_audit)
|
| 181 |
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
|
| 203 |
-
|
| 204 |
|
| 205 |
|
| 206 |
if __name__ == "__main__":
|
|
|
|
| 116 |
# Streamlit app
|
| 117 |
st.title("AUDIT DES DOCUMENTS")
|
| 118 |
|
| 119 |
+
notice = "Les formats autorisés sont les suivants :\n- **format texte** : txt, word, pdf\n- **format image** : png, jpg\n- **format audio** : wav, MP3"
|
| 120 |
+
|
| 121 |
+
st.markdown(notice)
|
| 122 |
+
|
| 123 |
if "audit" not in st.session_state:
|
| 124 |
st.session_state.audit = {}
|
| 125 |
if "name_file" not in st.session_state:
|
| 126 |
st.session_state.name_file = ""
|
| 127 |
|
| 128 |
# File uploader
|
| 129 |
+
uploaded_file = st.file_uploader("Télécharger un ou plusieurs documents")
|
| 130 |
|
| 131 |
if uploaded_file is not None:
|
| 132 |
type = classify_file(uploaded_file)
|
| 133 |
|
| 134 |
st.write(f"Type de fichier: {type}")
|
| 135 |
|
| 136 |
+
st.write("### Synthèse audit du ou des document(s) téléchargé(s)")
|
| 137 |
|
| 138 |
if type == "pdf":
|
| 139 |
display_audit_pdf(uploaded_file)
|
| 140 |
|
|
|
|
| 141 |
elif type == "audio":
|
| 142 |
if st.session_state.name_file != uploaded_file.name:
|
| 143 |
st.session_state.name_file = uploaded_file.name
|
|
|
|
| 182 |
|
| 183 |
st.code(well_formatted_audit)
|
| 184 |
|
| 185 |
+
elif type == "word":
|
| 186 |
+
if st.session_state.name_file != uploaded_file.name:
|
| 187 |
+
st.session_state.name_file = uploaded_file.name
|
| 188 |
+
with st.spinner("Analyse du document..."):
|
| 189 |
+
st.session_state.audit = audit_descriptif_word(uploaded_file)
|
| 190 |
+
audit = st.session_state.audit
|
| 191 |
+
|
| 192 |
+
#global audit
|
| 193 |
+
audit_simplified = {
|
| 194 |
+
"Nombre de pages": audit["number_of_paragraphs"],
|
| 195 |
+
"Nombre d'images": audit["number_of_images"],
|
| 196 |
+
"Nombre de liens": audit["number_of_links"],
|
| 197 |
+
"Nombre de tableaux": audit["number_of_tables"],
|
| 198 |
+
"Nombre de tokens": audit["number_of_tokens"],
|
| 199 |
+
"Nombre de mots": audit["number_of_words"]
|
| 200 |
+
}
|
| 201 |
|
| 202 |
+
well_formatted_audit = "Contenus audités\n"
|
| 203 |
+
for key, value in audit_simplified.items():
|
| 204 |
+
well_formatted_audit += f"- {key}: {value}\n"
|
| 205 |
|
| 206 |
+
st.code(well_formatted_audit)
|
| 207 |
|
| 208 |
|
| 209 |
if __name__ == "__main__":
|
utils/audit/audit_doc.py
CHANGED
|
@@ -54,10 +54,12 @@ def evaluate_text_quality(text: str) -> dict:
|
|
| 54 |
global_score_0_5 = global_score * 5
|
| 55 |
|
| 56 |
def extract_keywords(text):
|
| 57 |
-
rake = Rake(stopwords.words('
|
| 58 |
rake.extract_keywords_from_text(text)
|
| 59 |
return rake.get_ranked_phrases()
|
| 60 |
|
|
|
|
|
|
|
| 61 |
def count_tokens(input_string: str) -> int:
|
| 62 |
tokenizer = tiktoken.get_encoding("cl100k_base")
|
| 63 |
tokens = tokenizer.encode(input_string)
|
|
@@ -169,12 +171,6 @@ def audit_text(text: str) -> dict:
|
|
| 169 |
|
| 170 |
return audit_dict
|
| 171 |
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
def count_tokens(text):
|
| 175 |
-
# Implement a token counting method. Here, we assume tokens are words.
|
| 176 |
-
return len(text.split())
|
| 177 |
-
|
| 178 |
def audit_descriptif_word(file) -> dict:
|
| 179 |
document = Document(io.BytesIO(file.read()))
|
| 180 |
|
|
|
|
| 54 |
global_score_0_5 = global_score * 5
|
| 55 |
|
| 56 |
def extract_keywords(text):
|
| 57 |
+
rake = Rake(stopwords.words('french'))
|
| 58 |
rake.extract_keywords_from_text(text)
|
| 59 |
return rake.get_ranked_phrases()
|
| 60 |
|
| 61 |
+
|
| 62 |
+
|
| 63 |
def count_tokens(input_string: str) -> int:
|
| 64 |
tokenizer = tiktoken.get_encoding("cl100k_base")
|
| 65 |
tokens = tokenizer.encode(input_string)
|
|
|
|
| 171 |
|
| 172 |
return audit_dict
|
| 173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
def audit_descriptif_word(file) -> dict:
|
| 175 |
document = Document(io.BytesIO(file.read()))
|
| 176 |
|
utils/audit/transcript_audio.py
CHANGED
|
@@ -1,8 +1,5 @@
|
|
| 1 |
from openai import OpenAI
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
def transcript_audio_func(audio_file):
|
| 7 |
client = OpenAI()
|
| 8 |
transcription = client.audio.transcriptions.create(
|
|
|
|
| 1 |
from openai import OpenAI
|
| 2 |
|
|
|
|
|
|
|
|
|
|
| 3 |
def transcript_audio_func(audio_file):
|
| 4 |
client = OpenAI()
|
| 5 |
transcription = client.audio.transcriptions.create(
|