industrial_cost_calculation_language_model / app /document_processor.py

FilozofMuhendis

Initial model upload - Industrial Cost Calculation Chatbot

8223b74 verified 6 months ago

4.12 kB

	import json
	import os
	import shutil
	from datetime import datetime

	import docx
	import pytesseract
	from pdf2image import convert_from_path
	from PIL import Image
	from sqlalchemy.orm import Session

	from app.agent import create_agent
	from app.db.models import Document

	# Supported upload content types, mapped to the kind of extractor to use.
	SUPPORTED_CONTENT_TYPES = {
	    'application/pdf': 'pdf',
	    # Every accepted raster image format maps to the same 'image' kind.
	    **dict.fromkeys(('image/jpeg', 'image/png', 'image/tiff'), 'image'),
	    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
	}

	# File upload directory: an "uploads" folder under the process's current
	# working directory, resolved once when this module is imported.
	UPLOAD_DIR = os.path.join(os.getcwd(), 'uploads')
	# Create the directory at import time; exist_ok=True makes this a no-op
	# when the directory is already there.
	os.makedirs(UPLOAD_DIR, exist_ok=True)

	def save_uploaded_file(file, filename, upload_dir=None):
	    """Save an uploaded file to disk and return the path it was written to.

	    Args:
	        file: Upload object exposing a binary file-like ``file`` attribute.
	        filename: Client-supplied file name. Only its final path component
	            is used, so directory parts (``../``, absolute paths, Windows
	            drive/backslash prefixes) cannot redirect the write outside the
	            destination directory.
	        upload_dir: Destination directory. Defaults to ``UPLOAD_DIR``.

	    Returns:
	        The path of the saved file inside the destination directory.

	    Raises:
	        ValueError: If ``filename`` has no usable base name (empty, ``.``
	            or ``..``).
	    """
	    target_dir = UPLOAD_DIR if upload_dir is None else upload_dir

	    # The name is untrusted input: normalise backslashes so Windows-style
	    # paths are split too, then keep only the last component.
	    safe_name = os.path.basename((filename or "").replace("\\", "/"))
	    if safe_name in ("", ".", ".."):
	        raise ValueError(f"Invalid upload filename: {filename!r}")

	    file_path = os.path.join(target_dir, safe_name)
	    with open(file_path, "wb") as buffer:
	        # Stream in chunks instead of loading the whole upload into memory.
	        shutil.copyfileobj(file.file, buffer)
	    return file_path

	def extract_text_from_pdf(file_path, lang='tur'):
	    """Extract text from a PDF by converting its pages to images and running OCR.

	    Args:
	        file_path: Path of the PDF file.
	        lang: Tesseract language code passed to ``pytesseract``. Defaults
	            to ``'tur'`` (the value that used to be hard-coded).

	    Returns:
	        The OCR text of every page, each page followed by a newline. If any
	        step fails, the error is printed and an empty string is returned.
	    """
	    try:
	        # Convert the PDF into one image per page.
	        pages = convert_from_path(file_path)

	        # OCR each page; join once instead of growing a string with +=.
	        return "".join(
	            pytesseract.image_to_string(page, lang=lang) + "\n"
	            for page in pages
	        )
	    except Exception as e:
	        # Best-effort extraction: report the failure and return no text.
	        print(f"PDF işleme hatası: {str(e)}")
	        return ""

	def extract_text_from_image(file_path, lang='tur'):
	    """Extract text from an image file using OCR.

	    Args:
	        file_path: Path of the image file.
	        lang: Tesseract language code passed to ``pytesseract``. Defaults
	            to ``'tur'`` (the value that used to be hard-coded).

	    Returns:
	        The recognised text. If opening or OCR fails, the error is printed
	        and an empty string is returned.
	    """
	    try:
	        # The context manager closes the underlying file handle as soon as
	        # OCR is done instead of leaving it open until garbage collection.
	        with Image.open(file_path) as img:
	            return pytesseract.image_to_string(img, lang=lang)
	    except Exception as e:
	        # Best-effort extraction: report the failure and return no text.
	        print(f"Görüntü işleme hatası: {str(e)}")
	        return ""

	def extract_text_from_docx(file_path):
	    """Extract text from a DOCX file.

	    Reads the document's ``paragraphs`` collection and joins the paragraph
	    texts with newlines. If the file cannot be opened or read, the error is
	    printed and an empty string is returned.
	    """
	    try:
	        paragraphs = docx.Document(file_path).paragraphs
	        return "\n".join(item.text for item in paragraphs)
	    except Exception as e:
	        # Best-effort extraction: report the failure and return no text.
	        print(f"DOCX işleme hatası: {str(e)}")
	        return ""

	def process_document(file_path, content_type):
	    """Extract text from a document, choosing the extractor by content type.

	    Args:
	        file_path: Path of the stored document.
	        content_type: MIME type of the upload, looked up in
	            ``SUPPORTED_CONTENT_TYPES``.

	    Returns:
	        The extracted text. For a content type that is not in the table the
	        literal message "Desteklenmeyen dosya türü" is returned instead;
	        callers receive it as an ordinary string.
	    """
	    kind = SUPPORTED_CONTENT_TYPES.get(content_type)
	    if not kind:
	        return "Desteklenmeyen dosya türü"

	    # One extractor per document kind.
	    extractors = {
	        'pdf': extract_text_from_pdf,
	        'image': extract_text_from_image,
	        'docx': extract_text_from_docx,
	    }
	    extractor = extractors.get(kind)
	    # A kind without a registered extractor yields empty text.
	    return extractor(file_path) if extractor is not None else ""

	def analyze_document_content(content, db: Session):
	    """Analyze document content with the agent and return the result as JSON.

	    Args:
	        content: Extracted document text, embedded verbatim in the prompt.
	        db: Database session handed to ``create_agent``.

	    Returns:
	        A JSON string (non-ASCII characters kept as-is). On success it has
	        the keys ``analysis`` and ``analyzed_at``; if anything raises, the
	        error is printed and the JSON has ``error`` and ``analyzed_at``.
	    """
	    def _stamped(key, value):
	        # Serialize a single result field together with the current
	        # local time in ISO format.
	        payload = {key: value, "analyzed_at": datetime.now().isoformat()}
	        return json.dumps(payload, ensure_ascii=False)

	    try:
	        # Create the agent.
	        agent = create_agent(db)

	        # Analysis prompt. The continuation lines are part of the string
	        # literal, so their leading whitespace is kept exactly as it was.
	        prompt = f"""Bu belgeyi analiz et ve aşağıdaki bilgileri çıkar:
	1. Belgedeki maliyet hesaplamalarıyla ilgili tüm bilgiler
	2. İşçilik maliyetleri
	3. Malzeme maliyetleri
	4. Kar marjı bilgileri
	5. Toplam maliyet

	Belge içeriği:
	{content}
	"""

	        # Run the agent and return its output wrapped as JSON.
	        response = agent.invoke({"input": prompt})
	        return _stamped("analysis", response["output"])
	    except Exception as e:
	        print(f"Analiz hatası: {str(e)}")
	        return _stamped("error", f"Analiz sırasında hata oluştu: {str(e)}")

	def save_document_to_db(
	    db: Session,
	    filename,
	    content_type,
	    file_path,
	    file_size,
	    content_text=None,
	    analysis_result=None,
	):
	    """Save a document's information to the database and return the record.

	    Args:
	        db: Session used to add, commit and refresh the record.
	        filename: Name of the uploaded file.
	        content_type: Content type of the upload.
	        file_path: Where the file was stored.
	        file_size: Size of the file.
	        content_text: Extracted text, if any.
	        analysis_result: Analysis output, if any.

	    Returns:
	        The ``Document`` instance after commit and refresh.
	    """
	    fields = dict(
	        filename=filename,
	        content_type=content_type,
	        file_path=file_path,
	        file_size=file_size,
	        content_text=content_text,
	        analysis_result=analysis_result,
	    )
	    record = Document(**fields)

	    # Only records that carry a (truthy) analysis result get a timestamp.
	    if analysis_result:
	        record.analyzed_at = datetime.now()

	    db.add(record)
	    db.commit()
	    db.refresh(record)
	    return record