Spaces:

bluewhale2025
/

parseai-document-processor

Build error

App Files Files Community

parseai-document-processor / app.py

bluewhale2025

Remove punkt_tab references and update NLTK data handling

29ad632 7 months ago

raw

history blame contribute delete

9.12 kB

	import os
	import sys
	import logging
	from pathlib import Path
	from typing import List, Dict, Optional

	import gradio as gr
	from fastapi import FastAPI, HTTPException, status, UploadFile, File, BackgroundTasks
	from fastapi.middleware.cors import CORSMiddleware
	from dotenv import load_dotenv

	# Configure logging
	logging.basicConfig(
	level=logging.INFO,
	format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
	handlers=[
	logging.StreamHandler(sys.stdout)
	]
	)
	logger = logging.getLogger(__name__)

	# Load environment variables
	load_dotenv()

	# Initialize FastAPI
	app = FastAPI(
	title="ParseAI API",
	description="API for processing and analyzing PDF documents",
	version="1.0.0"
	)

	# CORS middleware configuration
	app.add_middleware(
	CORSMiddleware,
	allow_origins=["*"], # In production, replace with specific origins
	allow_credentials=True,
	allow_methods=["*"],
	allow_headers=["*"],
	)

	# Directory configuration
	BASE_DIR = Path("/home/user/app/data")
	UPLOAD_DIR = BASE_DIR / "uploads"
	PROCESSED_DIR = BASE_DIR / "processed"
	# Use system NLTK data directory that we'll populate in the Dockerfile
	NLTK_DATA_DIR = Path("/usr/local/share/nltk_data")

	# Ensure directories exist with proper permissions
	for directory in [BASE_DIR, UPLOAD_DIR, PROCESSED_DIR]:
	try:
	directory.mkdir(parents=True, exist_ok=True)
	# Set permissions to 0o777 (read/write/execute for all)
	directory.chmod(0o777)
	logger.info(f"Created directory: {directory}")
	except Exception as e:
	logger.error(f"Failed to create directory {directory}: {str(e)}")
	# Try to continue if directory creation fails
	if not directory.exists():
	raise
	os.chmod(directory, 0o755)
	logger.info(f"Ensured directory exists: {directory}")

	# Import modules after environment is set up
	try:
	from extractor import pdf_extractor
	from summarizer import document_summarizer
	from vector_store import vector_store

	# Initialize NLTK data
	import nltk

	# Set NLTK data path - system path first, then user path
	nltk_data_paths = [
	str(NLTK_DATA_DIR),
	'/usr/local/share/nltk_data',
	'/usr/share/nltk_data',
	'/usr/local/nltk_data',
	'/usr/local/share/nltk_data',
	'/usr/local/lib/nltk_data',
	'/usr/share/nltk_data',
	'/usr/local/share/nltk_data',
	'/usr/lib/nltk_data',
	'/usr/local/lib/nltk_data',
	'/root/nltk_data',
	'/home/user/nltk_data'
	]

	# Add all possible NLTK data paths
	nltk.data.path = list(dict.fromkeys(nltk_data_paths + nltk.data.path))

	# Verify NLTK data is available
	required_nltk_data = [
	'tokenizers/punkt',
	'corpora/stopwords',
	'corpora/wordnet',
	'taggers/averaged_perceptron_tagger'
	]

	for resource in required_nltk_data:
	try:
	nltk.data.find(resource)
	logger.info(f"NLTK resource found: {resource}")
	except LookupError as e:
	logger.warning(f"NLTK resource not found: {resource}")
	# Try to download the resource if not found
	try:
	resource_name = resource.split('/')[-1].split('.')[0]
	logger.info(f"Attempting to download NLTK resource: {resource_name}")
	nltk.download(resource_name, download_dir=str(NLTK_DATA_DIR))
	nltk.data.path.append(str(NLTK_DATA_DIR))
	nltk.data.find(resource) # Try to find it again after download
	logger.info(f"Successfully downloaded NLTK resource: {resource}")
	except Exception as download_error:
	logger.error(f"Failed to download NLTK resource {resource}: {str(download_error)}")

	except ImportError as e:
	logger.error(f"Failed to import required modules: {e}")
	raise

	# Health check endpoint
	@app.get("/health")
	async def health_check():
	"""Health check endpoint for monitoring"""
	return {
	"status": "healthy",
	"environment": os.getenv("ENV", "development"),
	"nltk_data": str(NLTK_DATA_DIR),
	"upload_dir": str(UPLOAD_DIR),
	"processed_dir": str(PROCESSED_DIR)
	}

	def process_document(file_path: str):
	"""
	Process a document by extracting text, summarizing it, and adding to the vector store.

	Args:
	file_path (str): Path to the file to process

	Returns:
	dict: Processing results including status, processed file path, and summary
	"""
	try:
	logger.info(f"Processing document: {file_path}")

	# PDF 텍스트 추출
	extracted_data = pdf_extractor.extract_text(file_path)
	logger.info(f"Extracted text from {len(extracted_data['text_by_page'])} pages")

	# 전체 텍스트 추출
	full_text = " ".join([page["text"] for page in extracted_data["text_by_page"]])

	# 텍스트 요약
	summary_result = document_summarizer.summarize_text(full_text)
	logger.info("Document summarization completed")

	# 벡터 저장소에 추가
	metadata = {
	"filename": extracted_data["filename"],
	"total_pages": extracted_data["total_pages"],
	"summary": summary_result.get("full_summary", ""),
	"timestamp": extracted_data.get("timestamp", "")
	}

	vector_store.add_document(full_text, metadata)
	logger.info("Document added to vector store")

	# 처리된 데이터 저장
	processed_path = pdf_extractor.save_extracted_text(
	{
	**extracted_data,
	"summary": summary_result.get("full_summary", ""),
	"chunk_summaries": summary_result.get("chunk_summaries", [])
	},
	str(PROCESSED_DIR)
	)
	logger.info(f"Processed data saved to {processed_path}")

	return {
	"status": "success",
	"processed_file": processed_path,
	"summary": summary_result.get("full_summary", "")
	}

	except Exception as e:
	error_msg = f"Error processing document: {str(e)}"
	logger.error(error_msg, exc_info=True)
	raise Exception(error_msg)

	@app.post("/upload/pdf")
	async def upload_pdf(
	file: UploadFile = File(...),
	background_tasks: BackgroundTasks = None
	):
	"""PDF 파일 업로드 API"""
	if not file.filename.lower().endswith('.pdf'):
	raise HTTPException(status_code=400, detail="PDF 파일만 업로드 가능합니다")

	file_path = UPLOAD_DIR / file.filename

	try:
	# 파일 저장
	with open(file_path, "wb") as buffer:
	content = await file.read()
	buffer.write(content)

	# 비동기로 문서 처리 시작
	background_tasks.add_task(process_document, str(file_path))

	return {"filename": file.filename, "status": "processing"}

	except Exception as e:
	raise HTTPException(status_code=500, detail=str(e))

	@app.get("/search")
	async def search_documents(query: str, top_k: int = 5):
	"""문서 검색 API"""
	try:
	results = vector_store.search(query, top_k)
	return {"results": results}
	except Exception as e:
	raise HTTPException(status_code=500, detail=str(e))

	# GradIO 인터페이스 생성
	def process_file(file_path):
	"""Process the uploaded file and return the summary"""
	# file_path is already a string path from Gradio's type="filepath"
	if not file_path or not os.path.exists(file_path):
	return "파일을 찾을 수 없습니다. 다시 시도해주세요."

	try:
	result = process_document(file_path)
	return result.get("summary", "요약을 생성할 수 없습니다.")
	except Exception as e:
	logger.error(f"Error processing file: {str(e)}", exc_info=True)
	return f"파일 처리 중 오류가 발생했습니다: {str(e)}"

	def search(query):
	results = vector_store.search(query)
	return "\n\n".join([f"{r['filename']} - 유사도: {r['similarity']:.2f}" for r in results["results"]])

	with gr.Blocks() as demo:
	gr.Markdown("# ParseAI PDF 분석 서비스")

	with gr.Tab("PDF 업로드"):
	file_input = gr.File(
	label="PDF 파일을 선택하세요",
	file_types=[".pdf"],
	type="filepath"
	)
	upload_button = gr.Button("업로드")
	summary_output = gr.Textbox(label="요약")

	upload_button.click(
	process_file,
	inputs=[file_input],
	outputs=[summary_output]
	)

	with gr.Tab("문서 검색"):
	search_input = gr.Textbox(label="검색어 입력")
	search_button = gr.Button("검색")
	search_output = gr.Textbox(label="검색 결과")

	search_button.click(
	search,
	inputs=[search_input],
	outputs=[search_output]
	)

	if __name__ == "__main__":
	demo.launch()