Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -38,6 +38,7 @@ from moviepy import VideoFileClip
|
|
| 38 |
import yt_dlp
|
| 39 |
from youtube_transcript_api import YouTubeTranscriptApi
|
| 40 |
from urllib.parse import urlparse, parse_qs
|
|
|
|
| 41 |
from ratelimit import limits, sleep_and_retry
|
| 42 |
import time
|
| 43 |
import fasttext
|
|
@@ -51,6 +52,7 @@ from PyPDF2 import PdfReader
|
|
| 51 |
from pptx import Presentation
|
| 52 |
import trafilatura
|
| 53 |
from bs4 import BeautifulSoup
|
|
|
|
| 54 |
from dotenv import load_dotenv
|
| 55 |
|
| 56 |
load_dotenv()
|
|
@@ -1084,6 +1086,42 @@ def process_document_with_password(file, password: str, doc_processor: DocumentP
|
|
| 1084 |
st.error(str(e))
|
| 1085 |
return None
|
| 1086 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1087 |
def process_web():
|
| 1088 |
"""Traitement des contenus web"""
|
| 1089 |
url = st.text_input("URL du site web")
|
|
@@ -1096,6 +1134,18 @@ def process_web():
|
|
| 1096 |
auth = {"username": username, "password": password}
|
| 1097 |
|
| 1098 |
if url and st.button("Analyser"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1099 |
try:
|
| 1100 |
doc_processor = DocumentProcessor(
|
| 1101 |
st.session_state.audio_processor.llm.model_name,
|
|
|
|
| 38 |
import yt_dlp
|
| 39 |
from youtube_transcript_api import YouTubeTranscriptApi
|
| 40 |
from urllib.parse import urlparse, parse_qs
|
| 41 |
+
import mimetypes
|
| 42 |
from ratelimit import limits, sleep_and_retry
|
| 43 |
import time
|
| 44 |
import fasttext
|
|
|
|
| 52 |
from pptx import Presentation
|
| 53 |
import trafilatura
|
| 54 |
from bs4 import BeautifulSoup
|
| 55 |
+
|
| 56 |
from dotenv import load_dotenv
|
| 57 |
|
| 58 |
load_dotenv()
|
|
|
|
| 1086 |
st.error(str(e))
|
| 1087 |
return None
|
| 1088 |
|
| 1089 |
+
|
| 1090 |
+
|
| 1091 |
+
|
| 1092 |
+
def is_text_content(url):
    """Return True when the server reports a textual Content-Type for *url*.

    Only the response headers are inspected: the request is streamed so the
    body is never downloaded, and the connection is released as soon as the
    header has been read.

    Args:
        url: Absolute URL to probe.

    Returns:
        True if the Content-Type header contains 'text', 'html' or
        'application/json' (case-insensitive); False otherwise, including
        when the request fails or times out.
    """
    # NOTE: pages rendered client-side with JavaScript would need a browser
    # driver (Selenium or Playwright); this check only looks at the headers
    # returned by a plain HTTP GET.
    try:
        # A timeout keeps an unresponsive host from blocking the UI forever.
        with requests.get(url, stream=True, timeout=10) as response:
            content_type = response.headers.get('content-type', '').lower()
    except requests.RequestException:
        # Best-effort probe: any network/URL error means "not analysable".
        return False

    text_markers = ('text', 'html', 'application/json')
    return any(marker in content_type for marker in text_markers)
|
| 1101 |
+
|
| 1102 |
+
def is_valid_content_url(url):
    """Check whether *url* is a candidate for text content extraction.

    A URL is rejected when its host is one of the excluded video / image /
    social-media sites (or a subdomain of one), or when its path ends with
    a media or PDF file extension.

    Args:
        url: The URL string to check.

    Returns:
        True if the URL may be processed, False if it is excluded.
    """
    excluded_domains = (
        'youtube.com', 'vimeo.com', 'dailymotion.com',
        'imgur.com', 'flickr.com', 'instagram.com',
        'facebook.com', 'fb.com', 'twitter.com', 'x.com',
        'tiktok.com', 'linkedin.com', 'pinterest.com',
        'snapchat.com', 'reddit.com', 'tumblr.com',
        'whatsapp.com', 'telegram.org', 'discord.com',
    )
    excluded_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.mp4', '.mp3', '.pdf')

    parsed = urlparse(url)
    # `hostname` is already lower-cased and excludes any port or userinfo;
    # it is None when the URL has no network location.
    host = (parsed.hostname or '').rstrip('.')
    path = parsed.path.lower()

    # Match the exact domain or a dot-delimited subdomain. A plain substring
    # test would wrongly reject unrelated hosts such as 'netflix.com' or
    # 'box.com' because they contain 'x.com'.
    is_excluded_domain = any(
        host == domain or host.endswith('.' + domain)
        for domain in excluded_domains
    )
    has_excluded_extension = path.endswith(excluded_extensions)

    return not (is_excluded_domain or has_excluded_extension)
|
| 1124 |
+
|
| 1125 |
def process_web():
|
| 1126 |
"""Traitement des contenus web"""
|
| 1127 |
url = st.text_input("URL du site web")
|
|
|
|
| 1134 |
auth = {"username": username, "password": password}
|
| 1135 |
|
| 1136 |
if url and st.button("Analyser"):
|
| 1137 |
+
if not url.startswith(('http://', 'https://')):
|
| 1138 |
+
st.error("L'URL doit commencer par 'http://' ou 'https://'")
|
| 1139 |
+
return
|
| 1140 |
+
|
| 1141 |
+
if not is_valid_content_url(url):
|
| 1142 |
+
st.error(f"Cette URL ({url}) ne peut pas être traitée (vidéo, image ou autre contenu non supporté)")
|
| 1143 |
+
return
|
| 1144 |
+
|
| 1145 |
+
if not is_text_content(url):
|
| 1146 |
+
st.error(f"Cette URL ({url}) ne contient pas de contenu textuel analysable")
|
| 1147 |
+
return
|
| 1148 |
+
|
| 1149 |
try:
|
| 1150 |
doc_processor = DocumentProcessor(
|
| 1151 |
st.session_state.audio_processor.llm.model_name,
|