Spaces:

mrwabnalas40
/

Because

Sleeping

App Files Files Community

Because / File_analyzer.py

mrwabnalas40

Update File_analyzer.py

6a500c8 verified 3 months ago

raw

history blame contribute delete

24.9 kB

	"""
	file_analyzer.py - الإصدار المحسن والمصحح

	وصف: خدمة ويب بسيطة تستقبل تحميل ملفات (MS Office: .docx, .xlsx, .pptx), PDF, .txt, .json, .xml
	ثم تقوم بتحليلها وإرجاع نتيجة تحليل JSON تحتوي على: نوع الملف، حجم، نص مستخرج، إحصاءات بسيطة (عدد الكلمات، الكلمات الشائعة)، وملاحظات خاصة بكل نوع (عدد الصفحات للـ PDF، أوراق العمل في Excel ..)

	المتطلبات (ثبتها قبل التشغيل):
	pip install flask werkzeug python-docx openpyxl python-pptx pypdf2 langdetect chardet

	ملاحظات:
	- PyPDF2 يستخدم لاستخراج نص من PDF (بسيط). لنتائج أفضل مع PDF الممسوحة ضوئياً استخدم OCR مثل pytesseract (لم يُدرج هنا).
	- هذا سكربت تعليمي — تأكد من تشغيله في بيئة آمنة عند استقبال ملفات من الخارج.

	تشغيل:
	python file_analyzer.py
	ثم افتح http://127.0.0.1:4580 وادخل إلى endpoint /upload (POST) لرفع ملف (form field name = file)

	"""
	from flask import Flask, request, jsonify
	from werkzeug.utils import secure_filename
	import os
	import io
	import json
	import xml.etree.ElementTree as ET
	from collections import Counter
	import chardet
	import re
	import zipfile
	from datetime import datetime
	import hashlib

	# Optional third-party parsers. Each import is attempted on its own so that a
	# missing package only disables the file type it serves; the *_AVAILABLE flag
	# records the outcome for the rest of the module.
	try:
	    from docx import Document
	except ImportError:
	    DOCX_AVAILABLE = False
	    print("⚠️ تحذير: python-docx غير مثبت - لن يتم دعم ملفات Word")
	else:
	    DOCX_AVAILABLE = True

	try:
	    from openpyxl import load_workbook
	except ImportError:
	    EXCEL_AVAILABLE = False
	    print("⚠️ تحذير: openpyxl غير مثبت - لن يتم دعم ملفات Excel")
	else:
	    EXCEL_AVAILABLE = True

	try:
	    from pptx import Presentation
	except ImportError:
	    PPTX_AVAILABLE = False
	    print("⚠️ تحذير: python-pptx غير مثبت - لن يتم دعم ملفات PowerPoint")
	else:
	    PPTX_AVAILABLE = True

	try:
	    from PyPDF2 import PdfReader
	except ImportError:
	    PDF_AVAILABLE = False
	    print("⚠️ تحذير: PyPDF2 غير مثبت - لن يتم دعم ملفات PDF")
	else:
	    PDF_AVAILABLE = True

	try:
	    from langdetect import detect, DetectorFactory
	except ImportError:
	    LANGDETECT_AVAILABLE = False
	    print("⚠️ تحذير: langdetect غير مثبت - لن يتم دعم كشف اللغة")
	else:
	    # Fixed seed so language detection gives repeatable results.
	    DetectorFactory.seed = 0
	    LANGDETECT_AVAILABLE = True

	# Supported extensions: the plain-text formats are always accepted; each
	# binary format is added only when its parser library was imported.
	ALLOWED_EXTENSIONS = {'txt', 'json', 'xml'}
	ALLOWED_EXTENSIONS.update(
	    extension
	    for extension, available in (
	        ('docx', DOCX_AVAILABLE),
	        ('xlsx', EXCEL_AVAILABLE),
	        ('pptx', PPTX_AVAILABLE),
	        ('pdf', PDF_AVAILABLE),
	    )
	    if available
	)

	UPLOAD_FOLDER = 'uploads'
	MAX_FILE_SIZE = 200 * 1024 * 1024  # 200MB upload limit
	MAX_MEMORY_CHUNK = 50 * 1024 * 1024  # 50MB cap for in-memory reads

	# The upload directory is created at import time so request handlers can
	# assume it exists.
	os.makedirs(UPLOAD_FOLDER, exist_ok=True)
	app = Flask(__name__)
	app.config.update(
	    UPLOAD_FOLDER=UPLOAD_FOLDER,
	    MAX_CONTENT_LENGTH=MAX_FILE_SIZE,
	)

	# Module-level logger used by every handler and helper below.
	import logging
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)


	def allowed_file(filename):
	    """Return True when ``filename`` has an extension listed in ALLOWED_EXTENSIONS.

	    The extension is the text after the last dot, compared case-insensitively.
	    Names without a dot are rejected.
	    """
	    _, dot, extension = filename.rpartition('.')
	    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS


	def safe_read_file(file_stream, max_size=MAX_MEMORY_CHUNK):
	    """Read at most ``max_size`` bytes from a binary stream.

	    Args:
	        file_stream: object with a ``read(n)`` method returning bytes.
	        max_size: upper bound on the number of bytes returned.

	    Returns:
	        The bytes read, never longer than ``max_size``.

	    A warning is logged only when the stream still holds data after the cap
	    was reached, i.e. when the content really was truncated. Detecting that
	    consumes one extra byte from the stream.
	    """
	    read_size = 8192
	    pieces = []
	    remaining = max_size

	    while remaining > 0:
	        # Never request more than the remaining budget, so the cap is exact.
	        chunk = file_stream.read(min(read_size, remaining))
	        if not chunk:
	            # End of stream before the cap: nothing was truncated.
	            return b''.join(pieces)
	        pieces.append(chunk)
	        remaining -= len(chunk)

	    # Cap reached: probe one byte to distinguish "exactly max_size" from
	    # "longer than max_size".
	    if file_stream.read(1):
	        logger.warning(f"File exceeded memory limit, truncated to {max_size} bytes")

	    # Joining once avoids re-copying the buffer on every chunk.
	    return b''.join(pieces)


	def check_zip_bomb(file_path, max_ratio=100, max_files=10000):
	    """Reject Office archives that look like a zip bomb.

	    Only paths ending in .docx, .xlsx or .pptx are inspected; any other path
	    returns immediately. The suffix is compared case-insensitively, so an
	    upload named "REPORT.DOCX" is checked like "report.docx".

	    Args:
	        file_path: path of the file on disk.
	        max_ratio: highest accepted uncompressed/compressed size ratio.
	        max_files: highest accepted number of archive members.

	    Raises:
	        ValueError: too many members, or the compression ratio is too high.
	        Exception: any error raised while opening or reading the archive is
	            logged and re-raised unchanged.
	    """
	    if not file_path.lower().endswith(('.docx', '.xlsx', '.pptx')):
	        return

	    try:
	        with zipfile.ZipFile(file_path, 'r') as zf:
	            compressed_total = 0
	            uncompressed_total = 0

	            for file_count, info in enumerate(zf.infolist(), start=1):
	                if file_count > max_files:
	                    raise ValueError("Too many files in archive - possible zip bomb")
	                uncompressed_total += info.file_size
	                compressed_total += info.compress_size

	            # Sizes come from the archive directory; nothing is extracted.
	            if compressed_total > 0 and uncompressed_total / compressed_total > max_ratio:
	                raise ValueError("Compression ratio too high - possible zip bomb")

	    except Exception as e:
	        logger.error(f"Zip bomb check failed: {str(e)}")
	        raise


	def get_text_stats(text):
	    """Compute word, sentence and pattern statistics for a piece of text.

	    Args:
	        text: the string to analyze.

	    Returns:
	        A dict with word/character/paragraph/sentence counts, the 20 most
	        common lower-cased words, the detected language (or None), up to ten
	        dates, e-mail addresses and phone-like numbers, an estimated reading
	        time and word-length figures.
	    """
	    # Everything that is neither alphanumeric nor whitespace becomes a word
	    # separator.
	    cleaned = ''.join(ch if (ch.isalnum() or ch.isspace()) else ' ' for ch in text)
	    words = cleaned.split()
	    total_words = len(words)

	    counter = Counter(w.lower() for w in words)

	    # Language detection is best-effort: any failure yields None.
	    language = None
	    try:
	        if LANGDETECT_AVAILABLE and total_words >= 3:
	            language = detect(' '.join(words[:1000]))
	    except Exception:
	        language = None

	    paragraphs = [p for p in text.split('\n') if p.strip()]
	    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]

	    date_patterns = (
	        r'\d{1,2}/\d{1,2}/\d{4}',
	        r'\d{4}-\d{2}-\d{2}',
	        r'\d{1,2}-\d{1,2}-\d{4}',
	    )
	    dates_found = []
	    for pattern in date_patterns:
	        dates_found.extend(re.findall(pattern, text))

	    # The top-level-domain class is letters only; a '|' inside a character
	    # class is a literal pipe, not alternation, and would be accepted.
	    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)

	    # Loose heuristic: an optional '+', then 7 to 15 digits.
	    phones = re.findall(r'[\+]?[1-9]?[0-9]{7,14}', text)

	    return {
	        'total_words': total_words,
	        'total_characters': len(text),
	        'total_characters_no_spaces': len(text.replace(' ', '')),
	        'paragraphs_count': len(paragraphs),
	        'sentences_count': len(sentences),
	        'top_words': counter.most_common(20),
	        'language': language,
	        'dates_found': dates_found[:10],  # first 10 only
	        'emails_found': emails[:10],  # first 10 only
	        'phones_found': phones[:10],  # first 10 only
	        'estimated_reading_minutes': max(1, total_words // 200),  # assumes 200 words/minute
	        'unique_words_count': len(set(words)),
	        'average_word_length': sum(len(word) for word in words) / total_words if words else 0
	    }


	def extract_metadata(file_path):
	    """Collect filesystem metadata and an MD5 hash for ``file_path``.

	    Returns:
	        A dict with the hash, ISO-formatted ``st_ctime``/``st_mtime``
	        timestamps and the size in bytes and megabytes. Any failure is logged
	        and an empty dict is returned instead.
	    """
	    try:
	        info = os.stat(file_path)
	        size_bytes = info.st_size
	        # NOTE(review): st_ctime is reported as 'created_time'; its meaning is
	        # platform dependent - confirm this is the intended value.
	        created = datetime.fromtimestamp(info.st_ctime)
	        modified = datetime.fromtimestamp(info.st_mtime)
	        return {
	            'file_hash_md5': calculate_file_hash(file_path),
	            'created_time': created.isoformat(),
	            'modified_time': modified.isoformat(),
	            'file_size_mb': round(size_bytes / (1024 * 1024), 2),
	            'file_size_bytes': size_bytes
	        }
	    except Exception as exc:
	        logger.error(f"Metadata extraction failed: {str(exc)}")
	        return {}


	def calculate_file_hash(file_path):
	    """Return the hex MD5 digest of the file at ``file_path``.

	    The file is read in 4096-byte blocks. If it cannot be opened or read, the
	    error is logged and the string "error_calculating_hash" is returned.
	    """
	    digest = hashlib.md5()
	    try:
	        with open(file_path, 'rb') as handle:
	            while True:
	                block = handle.read(4096)
	                if not block:
	                    break
	                digest.update(block)
	    except Exception as exc:
	        logger.error(f"Hash calculation failed: {str(exc)}")
	        return "error_calculating_hash"
	    return digest.hexdigest()


	def analyze_txt(file_stream):
	    """Analyze a plain-text upload.

	    Args:
	        file_stream: binary stream positioned at the start of the file.

	    Returns:
	        A dict with a text preview, the statistics from get_text_stats, the
	        encoding that was actually used to decode the content, and the
	        analysis type.
	    """
	    raw = safe_read_file(file_stream)

	    # Detect the encoding; fall back to UTF-8 when detection or decoding fails.
	    enc = 'utf-8'
	    try:
	        enc = chardet.detect(raw)['encoding'] or 'utf-8'
	        text = raw.decode(enc, errors='replace')
	    except Exception as e:
	        logger.warning(f"Encoding detection failed, using utf-8: {str(e)}")
	        # Reset so the reported encoding matches the one used below, and so
	        # `enc` is always bound even if detection itself raised.
	        enc = 'utf-8'
	        text = raw.decode('utf-8', errors='replace')

	    stats = get_text_stats(text)
	    return {
	        'text_preview': text[:4580],  # first 4580 characters only
	        'stats': stats,
	        'metadata': {'encoding': enc},
	        'analysis_type': 'text'
	    }


	def analyze_json(file_stream):
	    """Analyze a JSON upload.

	    Args:
	        file_stream: binary stream positioned at the start of the file.

	    Returns:
	        On success, a dict with a preview of the parsed value, the name of
	        its top-level type, text statistics over every string found (dict
	        keys included) and the analysis type. When the content cannot be
	        parsed, a dict with 'error': 'invalid_json' and the exception text.
	    """
	    raw = safe_read_file(file_stream)

	    try:
	        # 'utf-8-sig' strips a leading byte-order mark; json.loads rejects a
	        # string that still starts with one.
	        text_content = raw.decode('utf-8-sig', errors='replace')
	        obj = json.loads(text_content)
	    except Exception as e:
	        return {'error': 'invalid_json', 'exception': str(e), 'analysis_type': 'json'}

	    def extract_strings(o):
	        """Return every string in ``o``, walking dicts and lists recursively."""
	        if isinstance(o, str):
	            return [o]
	        if isinstance(o, dict):
	            res = []
	            for k, v in o.items():
	                res.append(k)  # keys contribute to the text as well
	                res += extract_strings(v)
	            return res
	        if isinstance(o, list):
	            res = []
	            for v in o:
	                res += extract_strings(v)
	            return res
	        # Numbers, booleans and null carry no text.
	        return []

	    all_text = ' '.join(extract_strings(obj))
	    stats = get_text_stats(all_text)

	    # Containers are echoed back only when their repr is short; otherwise
	    # (and for scalar documents) only the type is reported.
	    is_small_container = isinstance(obj, (dict, list)) and len(str(obj)) < 10000

	    return {
	        'json_preview': obj if is_small_container else str(type(obj)),
	        'structure_type': type(obj).__name__,
	        'stats': stats,
	        'analysis_type': 'json'
	    }


	def analyze_xml(file_stream):
	    """Analyze an XML upload with the standard-library ElementTree parser.

	    Args:
	        file_stream: binary stream positioned at the start of the file.

	    Returns:
	        On success, a dict with the root tag, text statistics, the number of
	        elements, the number of non-empty text fragments and the analysis
	        type. When the document is rejected or cannot be parsed, a dict with
	        'error': 'invalid_xml' and a description.
	    """
	    raw = safe_read_file(file_stream)

	    # Best-effort guard against entity-expansion attacks: refuse documents
	    # that declare their own entities. This byte search only sees
	    # ASCII-compatible encodings; use a hardened parser such as defusedxml
	    # if stronger guarantees are required.
	    if b'<!ENTITY' in raw:
	        return {
	            'error': 'invalid_xml',
	            'exception': 'entity declarations are not allowed',
	            'analysis_type': 'xml'
	        }

	    try:
	        # The stdlib XMLParser accepts only `target` and `encoding`; passing
	        # lxml-style options (resolve_entities, no_network) raises TypeError
	        # and would make every upload look invalid.
	        root = ET.fromstring(raw)
	    except Exception as e:
	        return {'error': 'invalid_xml', 'exception': str(e), 'analysis_type': 'xml'}

	    # Collect element text and tail text in document order, counting
	    # elements in the same pass.
	    texts = []
	    elements_count = 0
	    for elem in root.iter():
	        elements_count += 1
	        if elem.text and elem.text.strip():
	            texts.append(elem.text.strip())
	        if elem.tail and elem.tail.strip():
	            texts.append(elem.tail.strip())

	    all_text = ' '.join(texts)
	    stats = get_text_stats(all_text)

	    return {
	        'root_tag': root.tag,
	        'stats': stats,
	        'elements_count': elements_count,
	        'text_elements_count': len(texts),
	        'analysis_type': 'xml'
	    }


	def analyze_docx(path_or_stream):
	    """Analyze a Word document.

	    Text is gathered from body paragraphs, then table cells, then each
	    section's header and footer paragraphs. Returns statistics, a preview of
	    the first 10000 characters and paragraph/table/section counts, or an
	    error dict when python-docx is unavailable or the file cannot be read.
	    """
	    if not DOCX_AVAILABLE:
	        return {'error': 'docx_support_not_available', 'analysis_type': 'docx'}

	    def _non_empty(paragraphs):
	        # Stripped text of every paragraph that is not blank.
	        return [p.text.strip() for p in paragraphs if p.text and p.text.strip()]

	    try:
	        doc = Document(path_or_stream)

	        # Body paragraphs
	        texts = _non_empty(doc.paragraphs)

	        # Table cells
	        texts.extend(
	            cell.text.strip()
	            for table in doc.tables
	            for row in table.rows
	            for cell in row.cells
	            if cell.text and cell.text.strip()
	        )

	        # Headers and footers
	        for section in doc.sections:
	            if section.header:
	                texts.extend(_non_empty(section.header.paragraphs))
	            if section.footer:
	                texts.extend(_non_empty(section.footer.paragraphs))

	        all_text = '\n'.join(texts)

	        return {
	            'stats': get_text_stats(all_text),
	            'text_preview': all_text[:10000],
	            'paragraphs_count': len(doc.paragraphs),
	            'tables_count': len(doc.tables),
	            'sections_count': len(doc.sections),
	            'analysis_type': 'docx'
	        }

	    except Exception as exc:
	        return {'error': 'cannot_read_docx', 'exception': str(exc), 'analysis_type': 'docx'}


	def analyze_xlsx(path_or_stream):
	    """Analyze an Excel workbook.

	    At most the first 200 rows of every sheet are read. Returns the sheet
	    names, a per-sheet summary (first 10 rows, number of rows sampled,
	    reported dimensions), text statistics over all sampled cell values and
	    the analysis type, or an error dict when openpyxl is unavailable or the
	    file cannot be read.
	    """
	    if not EXCEL_AVAILABLE:
	        return {'error': 'xlsx_support_not_available', 'analysis_type': 'xlsx'}

	    wb = None
	    try:
	        wb = load_workbook(
	            filename=path_or_stream,
	            read_only=True,
	            data_only=True,
	            keep_vba=False  # do not retain VBA content from the file
	        )

	        sheets = wb.sheetnames
	        sheet_summaries = {}
	        all_data = []

	        for name in sheets:
	            ws = wb[name]
	            rows = []

	            # max_row bounds the iteration, so no separate counter is needed.
	            for row in ws.iter_rows(values_only=True, max_row=200):
	                rows.append([str(c) if c is not None else '' for c in row])
	                all_data.extend(str(c) for c in row if c is not None)

	            sheet_summaries[name] = {
	                'sample_rows': rows[:10],  # first 10 rows only, for display
	                'sample_row_count': len(rows),
	                'max_column': ws.max_column,
	                'max_row': ws.max_row
	            }

	        # Statistics over the combined cell text
	        all_text = ' '.join(all_data)
	        stats = get_text_stats(all_text)

	        return {
	            'sheets': sheets,
	            'sheet_summaries': sheet_summaries,
	            'stats': stats,
	            'analysis_type': 'xlsx'
	        }

	    except Exception as e:
	        return {'error': 'cannot_read_xlsx', 'exception': str(e), 'analysis_type': 'xlsx'}

	    finally:
	        # A read-only workbook keeps its archive open until closed explicitly.
	        if wb is not None:
	            wb.close()


	def analyze_pptx(path_or_stream):
	    """Analyze a PowerPoint presentation.

	    Returns the slide count, text statistics over all shape text, a preview
	    of the first five slides, the total number of shapes carrying text and
	    the analysis type, or an error dict when python-pptx is unavailable or
	    the file cannot be read.
	    """
	    if not PPTX_AVAILABLE:
	        return {'error': 'pptx_support_not_available', 'analysis_type': 'pptx'}

	    try:
	        presentation = Presentation(path_or_stream)
	        slide_summaries = []
	        collected = []

	        for number, slide in enumerate(presentation.slides, start=1):
	            # Stripped text of every shape that has non-blank text.
	            slide_texts = [
	                shape.text.strip()
	                for shape in slide.shapes
	                if hasattr(shape, "text") and shape.text and shape.text.strip()
	            ]
	            collected.extend(slide_texts)

	            # Shapes are counted when their text is non-empty, even if it is
	            # whitespace only.
	            text_shapes = sum(
	                1 for s in slide.shapes if hasattr(s, 'text') and s.text
	            )

	            slide_summaries.append({
	                'slide_number': number,
	                'slide_text': '\n'.join(slide_texts),
	                'shapes_count': text_shapes
	            })

	        stats = get_text_stats('\n'.join(collected))

	        return {
	            'num_slides': len(slide_summaries),
	            'stats': stats,
	            'slides_preview': slide_summaries[:5],  # first 5 slides only
	            'total_shapes': sum(entry['shapes_count'] for entry in slide_summaries),
	            'analysis_type': 'pptx'
	        }

	    except Exception as exc:
	        return {'error': 'cannot_read_pptx', 'exception': str(exc), 'analysis_type': 'pptx'}


	def analyze_pdf(path_or_stream):
	    """Analyze a PDF document.

	    Returns the page count, text statistics, a preview of the first 10000
	    characters, selected document-information fields, the encryption flag
	    and the analysis type, or an error dict when PyPDF2 is unavailable or
	    the file cannot be read.
	    """
	    if not PDF_AVAILABLE:
	        return {'error': 'pdf_support_not_available', 'analysis_type': 'pdf'}

	    # Output key -> document-information key.
	    info_fields = (
	        ('title', '/Title'),
	        ('author', '/Author'),
	        ('subject', '/Subject'),
	        ('creator', '/Creator'),
	        ('producer', '/Producer'),
	        ('creation_date', '/CreationDate'),
	    )

	    try:
	        reader = PdfReader(path_or_stream)

	        # Document information, when the file carries any.
	        info = reader.metadata
	        metadata = {name: info.get(key, '') for name, key in info_fields} if info else {}

	        # Page text; a page that fails to extract contributes an empty string
	        # so the remaining pages are still processed.
	        page_texts = []
	        for page_no, page in enumerate(reader.pages, start=1):
	            try:
	                page_texts.append(page.extract_text() or '')
	            except Exception as exc:
	                logger.warning(f"Failed to extract text from PDF page {page_no}: {str(exc)}")
	                page_texts.append('')

	        all_text = '\n'.join(page_texts)

	        return {
	            'num_pages': len(reader.pages),
	            'stats': get_text_stats(all_text),
	            'text_preview': all_text[:10000],
	            'pdf_metadata': metadata,
	            'encrypted': reader.is_encrypted,
	            'analysis_type': 'pdf'
	        }

	    except Exception as exc:
	        return {'error': 'cannot_read_pdf', 'exception': str(exc), 'analysis_type': 'pdf'}


	@app.route('/')
	def index():
	    """Serve the landing page with an upload form.

	    The page lists the currently supported extensions and the upload size
	    limit, and shows a notice when fewer than all seven formats are enabled.
	    """
	    supported_formats = ", ".join(sorted(ALLOWED_EXTENSIONS))
	    # Derived from the configured limit so the page cannot drift from it.
	    max_size_mb = MAX_FILE_SIZE // (1024 * 1024)
	    # Seven = three always-on text formats plus four optional library-backed ones.
	    missing_notice = (
	        "<div class='warning'><strong>Note:</strong> Some file types may not be available due to missing dependencies</div>"
	        if len(ALLOWED_EXTENSIONS) < 7 else ""
	    )
	    return f"""
	<html>
	<head>
	    <title>Simple File Analyzer</title>
	    <style>
	        body {{ font-family: Arial, sans-serif; margin: 40px; }}
	        .container {{ max-width: 800px; margin: 0 auto; }}
	        .upload-form {{ border: 2px dashed #ccc; padding: 20px; text-align: center; }}
	        .info {{ background: #f0f8ff; padding: 15px; border-radius: 5px; }}
	        .warning {{ background: #fffacd; padding: 10px; border-radius: 5px; margin: 10px 0; }}
	    </style>
	</head>
	<body>
	    <div class="container">
	        <h1>📁 Simple File Analyzer</h1>
	        <div class="info">
	            <p><strong>Supported formats:</strong> {supported_formats}</p>
	            <p><strong>Max file size:</strong> {max_size_mb}MB</p>
	        </div>
	        {missing_notice}
	        <div class="upload-form">
	            <h3>Upload a File for Analysis</h3>
	            <form action="/upload" method="post" enctype="multipart/form-data">
	                <input type="file" name="file" required>
	                <br><br>
	                <input type="submit" value="Analyze File" style="padding: 10px 20px;">
	            </form>
	        </div>
	        <p><a href="/health">Health Check</a> | <a href="/supported">Supported Formats</a></p>
	    </div>
	</body>
	</html>
	"""


	@app.route('/supported')
	def supported_formats():
	    """Report the enabled extensions, optional-library status and size limit as JSON."""
	    payload = {
	        'supported_extensions': sorted(ALLOWED_EXTENSIONS),
	        'libraries_status': {
	            'python-docx (Word)': DOCX_AVAILABLE,
	            'openpyxl (Excel)': EXCEL_AVAILABLE,
	            'python-pptx (PowerPoint)': PPTX_AVAILABLE,
	            'PyPDF2 (PDF)': PDF_AVAILABLE,
	            'langdetect (Language Detection)': LANGDETECT_AVAILABLE
	        },
	        'max_file_size_mb': MAX_FILE_SIZE / (1024 * 1024)
	    }
	    return jsonify(payload)


	@app.route('/health')
	def health_check():
	    """Return a JSON liveness report with the current time and format count."""
	    report = {
	        'status': 'healthy',
	        'timestamp': datetime.now().isoformat(),
	        'service': 'File Analyzer',
	        'supported_formats_count': len(ALLOWED_EXTENSIONS)
	    }
	    return jsonify(report)


	@app.route('/upload', methods=['POST'])
	def upload_file():
	    """Handle a multipart upload: validate, store temporarily, analyze, clean up.

	    Expects the file in the form field named "file".

	    Returns:
	        200 with the analysis JSON on success; 400 for a missing file, a
	        disallowed extension or a suspected zip bomb; 413 when the file is
	        too large; 500 for any unexpected error.

	    The temporary copy is stored under a random name and is always removed
	    before the response is returned, on error paths as well.
	    """
	    try:
	        if 'file' not in request.files:
	            return jsonify({'error': 'no_file_part'}), 400

	        file = request.files['file']
	        if file.filename == '':
	            return jsonify({'error': 'no_selected_file'}), 400

	        if not file or not allowed_file(file.filename):
	            return jsonify({
	                'error': 'file_type_not_allowed',
	                'allowed_extensions': list(ALLOWED_EXTENSIONS),
	                'your_file_extension': file.filename.rsplit('.', 1)[1].lower() if '.' in file.filename else 'unknown'
	            }), 400

	        # Take the extension from the client-supplied name before sanitizing:
	        # secure_filename() returns an ASCII-only name, so a non-ASCII name
	        # such as "ملف.docx" can lose its dot and could not be split
	        # afterwards. allowed_file() above guarantees a dot is present here.
	        ext = file.filename.rsplit('.', 1)[1].lower()
	        filename = secure_filename(file.filename)
	        if not filename.lower().endswith('.' + ext):
	            filename = f'upload.{ext}'

	        # A random on-disk name keeps concurrent uploads of the same filename
	        # from overwriting or deleting each other's temporary file, and the
	        # lower-cased extension matches what the zip-bomb check looks for.
	        stored_name = f'{os.urandom(8).hex()}.{ext}'
	        save_path = os.path.join(app.config['UPLOAD_FOLDER'], stored_name)

	        try:
	            file.stream.seek(0)
	            file.save(save_path)

	            # Verify the size of what was actually written.
	            size = os.path.getsize(save_path)
	            if size > MAX_FILE_SIZE:
	                return jsonify({'error': 'file_too_large', 'max_size_mb': MAX_FILE_SIZE / (1024 * 1024)}), 413

	            # Office formats are zip archives: check for zip bombs first.
	            if ext in ('docx', 'xlsx', 'pptx'):
	                try:
	                    check_zip_bomb(save_path)
	                except ValueError as e:
	                    return jsonify({'error': 'security_risk', 'message': str(e)}), 400

	            metadata = extract_metadata(save_path)

	            # Dispatch on extension; optional analyzers are registered only
	            # when their library is available.
	            analysis_functions = {
	                'txt': analyze_txt,
	                'json': analyze_json,
	                'xml': analyze_xml
	            }
	            if DOCX_AVAILABLE:
	                analysis_functions['docx'] = analyze_docx
	            if EXCEL_AVAILABLE:
	                analysis_functions['xlsx'] = analyze_xlsx
	            if PPTX_AVAILABLE:
	                analysis_functions['pptx'] = analyze_pptx
	            if PDF_AVAILABLE:
	                analysis_functions['pdf'] = analyze_pdf

	            with open(save_path, 'rb') as f:
	                if ext in analysis_functions:
	                    result = analysis_functions[ext](f)
	                else:
	                    result = {'error': 'unsupported_extension', 'extension': ext}
	        finally:
	            # Runs on every exit path above, so no temporary file is left behind.
	            if os.path.exists(save_path):
	                try:
	                    os.remove(save_path)
	                except OSError as e:
	                    logger.warning(f"Could not remove temporary file: {str(e)}")

	        response = {
	            'filename': filename,
	            'extension': ext,
	            'size_bytes': size,
	            'upload_time': datetime.now().isoformat(),
	            'metadata': metadata,
	            'analysis': result
	        }

	        logger.info(f"File analyzed successfully: {filename} ({size} bytes)")
	        return jsonify(response)

	    except MemoryError:
	        logger.error("Memory error during file processing")
	        return jsonify({'error': 'file_too_large_memory'}), 413
	    except Exception as e:
	        logger.error(f"Upload error: {str(e)}")
	        return jsonify({'error': 'internal_server_error', 'message': str(e)}), 500


	@app.errorhandler(413)
	def too_large(e):
	    """Answer HTTP 413 errors with a JSON body that states the size limit in MB."""
	    limit_mb = MAX_FILE_SIZE / (1024 * 1024)
	    body = {'error': 'file_too_large', 'max_size_mb': limit_mb}
	    return jsonify(body), 413


	@app.errorhandler(500)
	def internal_error(e):
	    """Answer HTTP 500 errors with a generic JSON body."""
	    body = {'error': 'internal_server_error'}
	    return jsonify(body), 500


	if __name__ == '__main__':
	    # Single source of truth for the port, so the printed URL matches the
	    # port the server actually binds.
	    port = 8490

	    # The server listens on every interface, so the interactive debugger
	    # must stay off unless explicitly requested: it lets anyone who can
	    # reach the port execute code. Set FLASK_DEBUG=1 for local development.
	    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes')

	    logger.info("Starting File Analyzer Service...")
	    logger.info(f"Upload folder: {os.path.abspath(UPLOAD_FOLDER)}")
	    logger.info(f"Allowed extensions: {ALLOWED_EXTENSIONS}")
	    logger.info(f"Available libraries: DOCX={DOCX_AVAILABLE}, EXCEL={EXCEL_AVAILABLE}, PPTX={PPTX_AVAILABLE}, PDF={PDF_AVAILABLE}")

	    print(f"\n🎯 File Analyzer Service Ready!")
	    print(f"📁 Supported formats: {', '.join(sorted(ALLOWED_EXTENSIONS))}")
	    print(f"🌐 Access at: http://127.0.0.1:{port}")
	    print(f"💾 Upload folder: {os.path.abspath(UPLOAD_FOLDER)}")

	    app.run(
	        debug=debug_enabled,
	        host='0.0.0.0',  # accept connections from any address
	        port=port
	    )