# ParseAI — FastAPI + Gradio service for PDF text extraction, summarization,
# and vector search, packaged as a Hugging Face Space.
import os
import sys
import logging
from pathlib import Path
from typing import List, Dict, Optional
import gradio as gr
from fastapi import FastAPI, HTTPException, status, UploadFile, File, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from dotenv import load_dotenv
# --- Logging / environment setup -----------------------------------------
# Emit INFO-level records to stdout so container logs capture everything.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

# Pull configuration from a local .env file into the process environment.
load_dotenv()
# --- FastAPI application ---------------------------------------------------
app = FastAPI(
    title="ParseAI API",
    version="1.0.0",
    description="API for processing and analyzing PDF documents",
)

# Open CORS policy for development.
# NOTE(review): wildcard origins combined with allow_credentials=True is
# rejected by browsers under the CORS spec — pin specific origins in
# production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)
# --- Directory configuration ----------------------------------------------
# Fixed data layout inside the Space container.
BASE_DIR = Path("/home/user/app/data")
UPLOAD_DIR = BASE_DIR / "uploads"
PROCESSED_DIR = BASE_DIR / "processed"
# NLTK corpora are baked into the image by the Dockerfile at this path.
NLTK_DATA_DIR = Path("/usr/local/share/nltk_data")

# Ensure the data directories exist and are writable.
for directory in [BASE_DIR, UPLOAD_DIR, PROCESSED_DIR]:
    try:
        directory.mkdir(parents=True, exist_ok=True)
        # World-writable so the unprivileged runtime user can write here.
        directory.chmod(0o777)
        logger.info(f"Created directory: {directory}")
    except Exception as e:
        logger.error(f"Failed to create directory {directory}: {str(e)}")
        # Fatal only if the directory truly does not exist; a pre-existing
        # directory we merely could not chmod is still usable.
        if not directory.exists():
            raise
        try:
            # Best-effort fallback to more restrictive permissions; this
            # chmod can itself fail (e.g. not the owner) and must not abort
            # module import when the directory is already present.
            os.chmod(directory, 0o755)
        except OSError:
            logger.warning(f"Could not adjust permissions on {directory}")
        logger.info(f"Ensured directory exists: {directory}")
# Import the project modules only after logging and directories are ready.
try:
    from extractor import pdf_extractor
    from summarizer import document_summarizer
    from vector_store import vector_store

    import nltk

    # Search the system-wide NLTK data directory (populated by the
    # Dockerfile) before any user-level locations.
    nltk.data.path = [str(NLTK_DATA_DIR)] + nltk.data.path

    # Sanity-check that the corpora the pipeline relies on are present.
    # Missing resources are only logged — provisioning is the image
    # build's responsibility, not a runtime download.
    required_nltk_data = [
        'tokenizers/punkt',
        'corpora/stopwords',
        'corpora/wordnet',
        'taggers/averaged_perceptron_tagger'
    ]
    for resource in required_nltk_data:
        try:
            nltk.data.find(resource)
        except LookupError:
            logger.warning(f"NLTK resource not found: {resource}")
        else:
            logger.info(f"NLTK resource found: {resource}")
except ImportError as e:
    logger.error(f"Failed to import required modules: {e}")
    raise
@app.get("/health")
async def health_check():
    """Report service liveness plus the key runtime paths, for monitoring."""
    environment = os.getenv("ENV", "development")
    return {
        "status": "healthy",
        "environment": environment,
        "nltk_data": str(NLTK_DATA_DIR),
        "upload_dir": str(UPLOAD_DIR),
        "processed_dir": str(PROCESSED_DIR),
    }
async def process_document(file_path: str):
    """
    Process a document: extract its text, summarize it, and index it in
    the vector store.

    Args:
        file_path (str): Path to the PDF file to process.

    Returns:
        dict: {"status", "processed_file", "summary"} on success.

    Raises:
        Exception: wraps any underlying failure, with the original
            exception chained via ``from`` for debugging.
    """
    try:
        logger.info(f"Processing document: {file_path}")

        # Extract per-page text from the PDF.
        extracted_data = pdf_extractor.extract_text(file_path)
        logger.info(f"Extracted text from {len(extracted_data['text_by_page'])} pages")

        # Join all pages into a single string for summarization/indexing.
        full_text = " ".join(page["text"] for page in extracted_data["text_by_page"])

        # Summarize the full text.
        summary_result = document_summarizer.summarize_text(full_text)
        logger.info("Document summarization completed")

        # Index the document in the vector store with its metadata.
        metadata = {
            "filename": extracted_data["filename"],
            "total_pages": extracted_data["total_pages"],
            "summary": summary_result.get("full_summary", ""),
            "timestamp": extracted_data.get("timestamp", "")
        }
        vector_store.add_document(full_text, metadata)
        logger.info("Document added to vector store")

        # Persist the extracted text together with the summaries.
        processed_path = pdf_extractor.save_extracted_text(
            {
                **extracted_data,
                "summary": summary_result.get("full_summary", ""),
                "chunk_summaries": summary_result.get("chunk_summaries", [])
            },
            str(PROCESSED_DIR)
        )
        logger.info(f"Processed data saved to {processed_path}")

        return {
            "status": "success",
            "processed_file": processed_path,
            "summary": summary_result.get("full_summary", "")
        }
    except Exception as e:
        error_msg = f"Error processing document: {str(e)}"
        logger.error(error_msg, exc_info=True)
        # Chain the original exception so the real traceback survives the
        # re-raise (the original bare `raise Exception(...)` discarded it).
        raise Exception(error_msg) from e
@app.post("/upload/pdf")
async def upload_pdf(
    file: UploadFile = File(...),
    background_tasks: BackgroundTasks = None
):
    """Upload a PDF and schedule background processing for it.

    Returns a {"filename", "status"} payload; raises 400 for non-PDF
    uploads and 500 on storage failures.
    """
    if not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="PDF ํŒŒ์ผ๋งŒ ์—…๋กœ๋“œ ๊ฐ€๋Šฅํ•ฉ๋‹ˆ๋‹ค")

    # Use only the basename of the client-supplied filename so a crafted
    # name like "../../etc/cron.d/x.pdf" cannot escape UPLOAD_DIR.
    safe_name = Path(file.filename).name
    file_path = UPLOAD_DIR / safe_name
    try:
        # Persist the uploaded bytes.
        with open(file_path, "wb") as buffer:
            content = await file.read()
            buffer.write(content)
        # FastAPI injects BackgroundTasks for declared parameters; guard
        # against None so direct (non-HTTP) calls fail gracefully.
        if background_tasks is not None:
            background_tasks.add_task(process_document, str(file_path))
        return {"filename": file.filename, "status": "processing"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/search")
async def search_documents(query: str, top_k: int = 5):
    """Search indexed documents; returns the top_k most similar matches."""
    try:
        matches = vector_store.search(query, top_k)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return {"results": matches}
# --- Gradio handlers -------------------------------------------------------
def process_file(file):
    """Gradio handler: copy the uploaded PDF into UPLOAD_DIR and process it.

    Args:
        file: Filepath string supplied by gr.File(type="filepath") —
            NOT a file object, so there is no .name/.getbuffer() on it.

    Returns:
        str: The generated document summary.
    """
    import asyncio
    import shutil

    # Copy the Gradio temp file into our upload area under its basename.
    src_path = Path(file)
    file_path = UPLOAD_DIR / src_path.name
    shutil.copyfile(src_path, file_path)

    # process_document is a coroutine; Gradio calls this handler
    # synchronously, so drive it to completion here. (The original called
    # it without awaiting and then subscripted the coroutine object.)
    result = asyncio.run(process_document(str(file_path)))
    return result["summary"]
def search(query):
    """Gradio handler: search the vector store and format matches as text."""
    results = vector_store.search(query)
    # NOTE(review): the /search endpoint treats vector_store.search()'s
    # return value as the match list itself, while this code indexed
    # results["results"] — the shapes cannot both be right. Accept either.
    matches = results.get("results", []) if isinstance(results, dict) else results
    return "\n\n".join([f"{r['filename']} - ์œ ์‚ฌ๋„: {r['similarity']:.2f}" for r in matches])
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# ParseAI PDF ๋ถ„์„ ์„œ๋น„์Šค")

    with gr.Tab("PDF ์—…๋กœ๋“œ"):
        # type="filepath" makes the click handler receive a path string.
        # file_types entries are extensions WITH a leading dot; the bare
        # "pdf" in the original did not match anything, so the file picker
        # filter silently never applied.
        file_input = gr.File(type="filepath", file_types=[".pdf"])
        upload_button = gr.Button("์—…๋กœ๋“œ")
        summary_output = gr.Textbox(label="์š”์•ฝ")
        upload_button.click(
            process_file,
            inputs=[file_input],
            outputs=[summary_output]
        )

    with gr.Tab("๋ฌธ์„œ ๊ฒ€์ƒ‰"):
        search_input = gr.Textbox(label="๊ฒ€์ƒ‰์–ด ์ž…๋ ฅ")
        search_button = gr.Button("๊ฒ€์ƒ‰")
        search_output = gr.Textbox(label="๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ")
        search_button.click(
            search,
            inputs=[search_input],
            outputs=[search_output]
        )

if __name__ == "__main__":
    demo.launch()