|
|
import os |
|
|
import json |
|
|
import pytesseract |
|
|
from PIL import Image |
|
|
from pdf2image import convert_from_path |
|
|
import docx |
|
|
from datetime import datetime |
|
|
from sqlalchemy.orm import Session |
|
|
from app.db.models import Document |
|
|
from app.agent import create_agent |
|
|
|
|
|
|
|
|
# Maps each accepted upload MIME type to the extractor family that handles it
# ("pdf", "image" or "docx"). Content types missing from this table are
# treated as unsupported.
SUPPORTED_CONTENT_TYPES = {
    "application/pdf": "pdf",
    "image/jpeg": "image",
    "image/png": "image",
    "image/tiff": "image",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
}


# Uploaded files are stored in an "uploads" directory under the process's
# current working directory; it is created at import time if it is missing.
UPLOAD_DIR = os.path.join(os.getcwd(), "uploads")
os.makedirs(UPLOAD_DIR, exist_ok=True)
|
|
|
|
|
def save_uploaded_file(file, filename):
    """Persist an uploaded file under ``UPLOAD_DIR`` and return its path.

    Args:
        file: Upload wrapper exposing a binary file-like object as
            ``file.file``; only ``file.file.read`` is used.
        filename: Client-supplied file name. Only its final path component
            is kept, so directory parts (``../``, absolute paths) cannot
            move the write outside ``UPLOAD_DIR``.

    Returns:
        The path of the written file inside ``UPLOAD_DIR``.

    Raises:
        ValueError: If ``filename`` has no usable final component (empty,
            ``.`` or ``..``).
    """
    # The name is untrusted input: normalise Windows separators, then drop
    # every directory component before joining it to the upload directory.
    safe_name = os.path.basename((filename or "").replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Invalid upload filename: {filename!r}")

    file_path = os.path.join(UPLOAD_DIR, safe_name)
    source = file.file
    with open(file_path, "wb") as buffer:
        # Copy in fixed-size chunks so a large upload is never held in
        # memory as a single bytes object.
        while True:
            chunk = source.read(1024 * 1024)
            if not chunk:
                break
            buffer.write(chunk)
    return file_path
|
|
|
|
|
def extract_text_from_pdf(file_path):
    """Extract text from a PDF by rendering its pages and running OCR.

    The file is converted to page images with ``convert_from_path`` and each
    image is passed to ``pytesseract.image_to_string`` with ``lang='tur'``.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The OCR text of all pages, each page's text followed by a newline,
        or an empty string if any step raises.
    """
    try:
        pages = convert_from_path(file_path)
        # Build the result with a single join instead of growing a string
        # with ``+=`` once per page.
        return "".join(
            pytesseract.image_to_string(page, lang='tur') + "\n"
            for page in pages
        )
    except Exception as e:
        # Best effort: report the failure and hand back an empty result.
        print(f"PDF işleme hatası: {str(e)}")
        return ""
|
|
|
|
|
def extract_text_from_image(file_path):
    """Extract text from an image file using OCR.

    The image is opened with ``PIL.Image.open`` and passed to
    ``pytesseract.image_to_string`` with ``lang='tur'``.

    Args:
        file_path: Path of the image file to read.

    Returns:
        The recognised text, or an empty string if opening the image or
        running OCR raises.
    """
    try:
        # The context manager releases the underlying file handle even when
        # OCR raises part-way through.
        with Image.open(file_path) as img:
            return pytesseract.image_to_string(img, lang='tur')
    except Exception as e:
        # Best effort: report the failure and hand back an empty result.
        print(f"Görüntü işleme hatası: {str(e)}")
        return ""
|
|
|
|
|
def extract_text_from_docx(file_path):
    """Extract the paragraph text of a DOCX file.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The text of every paragraph in ``doc.paragraphs`` joined with
        newlines, or an empty string if the file cannot be read.
    """
    try:
        document = docx.Document(file_path)
        paragraph_texts = (para.text for para in document.paragraphs)
        return "\n".join(paragraph_texts)
    except Exception as e:
        # Best effort: report the failure and hand back an empty result.
        print(f"DOCX işleme hatası: {str(e)}")
        return ""
|
|
|
|
|
def process_document(file_path, content_type):
    """Extract text from a document according to its content type.

    Args:
        file_path: Path of the stored document.
        content_type: MIME type of the document, looked up in
            ``SUPPORTED_CONTENT_TYPES``.

    Returns:
        The text produced by the matching extractor. For a content type that
        is not in ``SUPPORTED_CONTENT_TYPES`` the fixed message
        ``"Desteklenmeyen dosya türü"`` is returned instead, and an empty
        string is returned when the mapped file type has no extractor.
    """
    file_type = SUPPORTED_CONTENT_TYPES.get(content_type)
    if not file_type:
        return "Desteklenmeyen dosya türü"

    # Dispatch table: file-type tag -> extractor function.
    extractors = {
        'pdf': extract_text_from_pdf,
        'image': extract_text_from_image,
        'docx': extract_text_from_docx,
    }
    extractor = extractors.get(file_type)
    if extractor is None:
        return ""
    return extractor(file_path)
|
|
|
|
|
def analyze_document_content(content, db: Session):
    """Run the agent over a document's text and return the result as JSON.

    An agent is built with ``create_agent(db)`` and invoked with a prompt
    that embeds ``content``.

    Args:
        content: Extracted document text to embed in the prompt.
        db: Database session handed to ``create_agent``.

    Returns:
        A JSON string (non-ASCII characters kept as-is). On success it holds
        ``"analysis"`` (the agent's ``"output"`` value) and ``"analyzed_at"``
        (ISO timestamp from ``datetime.now()``). If anything raises, it holds
        ``"error"`` and ``"analyzed_at"`` instead.
    """
    try:
        executor = create_agent(db)

        # NOTE(review): prompt lines are kept flush-left as they appear in
        # the reviewed source; confirm the intended whitespace.
        prompt = f"""Bu belgeyi analiz et ve aşağıdaki bilgileri çıkar:
1. Belgedeki maliyet hesaplamalarıyla ilgili tüm bilgiler
2. İşçilik maliyetleri
3. Malzeme maliyetleri
4. Kar marjı bilgileri
5. Toplam maliyet

Belge içeriği:
{content}
"""

        response = executor.invoke({"input": prompt})
        success_payload = {
            "analysis": response["output"],
            "analyzed_at": datetime.now().isoformat(),
        }
        # Serialisation stays inside the try so a non-serialisable agent
        # output is reported through the error payload as well.
        return json.dumps(success_payload, ensure_ascii=False)
    except Exception as e:
        print(f"Analiz hatası: {str(e)}")
        failure_payload = {
            "error": f"Analiz sırasında hata oluştu: {str(e)}",
            "analyzed_at": datetime.now().isoformat(),
        }
        return json.dumps(failure_payload, ensure_ascii=False)
|
|
|
|
|
def save_document_to_db(db: Session, filename, content_type, file_path, file_size, content_text=None, analysis_result=None):
    """Create a ``Document`` row, commit it and return the refreshed object.

    Args:
        db: SQLAlchemy session used to add, commit and refresh the row.
        filename: Value stored in ``Document.filename``.
        content_type: Value stored in ``Document.content_type``.
        file_path: Value stored in ``Document.file_path``.
        file_size: Value stored in ``Document.file_size``.
        content_text: Optional value stored in ``Document.content_text``.
        analysis_result: Optional value stored in
            ``Document.analysis_result``; when truthy, ``analyzed_at`` is
            set to ``datetime.now()``.

    Returns:
        The persisted ``Document`` instance after ``db.refresh``.

    Raises:
        Exception: Whatever ``db.commit()`` raises is re-raised after the
            session has been rolled back.
    """
    document = Document(
        filename=filename,
        content_type=content_type,
        file_path=file_path,
        file_size=file_size,
        content_text=content_text,
        analysis_result=analysis_result,
    )

    if analysis_result:
        document.analyzed_at = datetime.now()

    db.add(document)
    try:
        db.commit()
    except Exception:
        # Roll back so the caller's session stays usable after a failed
        # commit, then let the original error propagate.
        db.rollback()
        raise
    db.refresh(document)
    return document