import os
import sys
import logging
from pathlib import Path
from typing import List, Dict, Optional

import gradio as gr
from fastapi import FastAPI, HTTPException, status, UploadFile, File, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from dotenv import load_dotenv

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Initialize FastAPI
app = FastAPI(
    title="ParseAI API",
    description="API for processing and analyzing PDF documents",
    version="1.0.0"
)

# CORS middleware configuration
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace with specific origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
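
# A hedged example of a restricted CORS policy for production (the origin
# below is hypothetical):
#   allow_origins=["https://parseai.example.com"]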

# Directory configuration
BASE_DIR = Path("/home/user/app/data")
UPLOAD_DIR = BASE_DIR / "uploads"
PROCESSED_DIR = BASE_DIR / "processed"
# Use system NLTK data directory that we'll populate in the Dockerfile
NLTK_DATA_DIR = Path("/usr/local/share/nltk_data")

# Ensure directories exist with proper permissions
for directory in [BASE_DIR, UPLOAD_DIR, PROCESSED_DIR]:
    try:
        directory.mkdir(parents=True, exist_ok=True)
        # Set permissions to 0o777 (read/write/execute for all)
        directory.chmod(0o777)
        logger.info(f"Created directory: {directory}")
    except Exception as e:
        logger.error(f"Failed to create directory {directory}: {str(e)}")
        # Re-raise only if the directory is genuinely missing; otherwise
        # fall back to more restrictive permissions and continue
        if not directory.exists():
            raise
        os.chmod(directory, 0o755)
        logger.info(f"Ensured directory exists: {directory}")

# Import modules after environment is set up
try:
    from extractor import pdf_extractor
    from summarizer import document_summarizer
    from vector_store import vector_store
    
    # Initialize NLTK data
    import nltk
    
    # Candidate NLTK data paths - system locations first, then user locations
    nltk_data_paths = [
        str(NLTK_DATA_DIR),
        '/usr/share/nltk_data',
        '/usr/local/nltk_data',
        '/usr/local/lib/nltk_data',
        '/usr/lib/nltk_data',
        '/root/nltk_data',
        '/home/user/nltk_data'
    ]

    # Prepend the candidate paths, deduplicated, ahead of NLTK's defaults
    nltk.data.path = list(dict.fromkeys(nltk_data_paths + nltk.data.path))
    
    # Verify NLTK data is available
    required_nltk_data = [
        'tokenizers/punkt',
        'corpora/stopwords',
        'corpora/wordnet',
        'taggers/averaged_perceptron_tagger'
    ]

    for resource in required_nltk_data:
        try:
            nltk.data.find(resource)
            logger.info(f"NLTK resource found: {resource}")
        except LookupError:
            logger.warning(f"NLTK resource not found: {resource}")
            # Fall back to downloading it; the download name is the last path
            # component (e.g. 'tokenizers/punkt' -> 'punkt')
            try:
                resource_name = resource.split('/')[-1].split('.')[0]
                logger.info(f"Attempting to download NLTK resource: {resource_name}")
                nltk.download(resource_name, download_dir=str(NLTK_DATA_DIR))
                nltk.data.find(resource)  # Verify it is resolvable after download
                logger.info(f"Successfully downloaded NLTK resource: {resource}")
            except Exception as download_error:
                logger.error(f"Failed to download NLTK resource {resource}: {str(download_error)}")

except ImportError as e:
    logger.error(f"Failed to import required modules: {e}")
    raise

# Health check endpoint
@app.get("/health")
async def health_check():
    """Health check endpoint for monitoring"""
    return {
        "status": "healthy",
        "environment": os.getenv("ENV", "development"),
        "nltk_data": str(NLTK_DATA_DIR),
        "upload_dir": str(UPLOAD_DIR),
        "processed_dir": str(PROCESSED_DIR)
    }
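
# Example health probe (a sketch; assumes the API is served with uvicorn on
# port 8000 - see the note at the bottom of this file):
#   curl http://localhost:8000/health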

def process_document(file_path: str):
    """
    Process a document by extracting text, summarizing it, and adding to the vector store.
    
    Args:
        file_path (str): Path to the file to process
        
    Returns:
        dict: Processing results including status, processed file path, and summary
    """
    try:
        logger.info(f"Processing document: {file_path}")
        
        # Extract text from the PDF
        extracted_data = pdf_extractor.extract_text(file_path)
        logger.info(f"Extracted text from {len(extracted_data['text_by_page'])} pages")
        
        # Join the per-page text into one string
        full_text = " ".join([page["text"] for page in extracted_data["text_by_page"]])
        
        # ν…μŠ€νŠΈ μš”μ•½
        summary_result = document_summarizer.summarize_text(full_text)
        logger.info("Document summarization completed")
        
        # Add the document to the vector store
        metadata = {
            "filename": extracted_data["filename"],
            "total_pages": extracted_data["total_pages"],
            "summary": summary_result.get("full_summary", ""),
            "timestamp": extracted_data.get("timestamp", "")
        }
        
        vector_store.add_document(full_text, metadata)
        logger.info("Document added to vector store")
        
        # Save the processed data
        processed_path = pdf_extractor.save_extracted_text(
            {
                **extracted_data,
                "summary": summary_result.get("full_summary", ""),
                "chunk_summaries": summary_result.get("chunk_summaries", [])
            },
            str(PROCESSED_DIR)
        )
        logger.info(f"Processed data saved to {processed_path}")
        
        return {
            "status": "success",
            "processed_file": processed_path,
            "summary": summary_result.get("full_summary", "")
        }
        
    except Exception as e:
        error_msg = f"Error processing document: {str(e)}"
        logger.error(error_msg, exc_info=True)
        raise RuntimeError(error_msg) from e
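
# Direct invocation example (a sketch; the file name below is hypothetical):
#   result = process_document(str(UPLOAD_DIR / "sample.pdf"))
#   print(result["summary"])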

@app.post("/upload/pdf")
async def upload_pdf(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...)
):
    """PDF file upload API"""
    if not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files can be uploaded")
    
    # Use only the base name to guard against path traversal in the filename
    file_path = UPLOAD_DIR / Path(file.filename).name
    
    try:
        # Save the uploaded file
        with open(file_path, "wb") as buffer:
            content = await file.read()
            buffer.write(content)
        
        # λΉ„λ™κΈ°λ‘œ λ¬Έμ„œ 처리 μ‹œμž‘
        background_tasks.add_task(process_document, str(file_path))
        
        return {"filename": file.filename, "status": "processing"}
        
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
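
# Example upload request (a sketch; assumes the API is served with uvicorn on
# port 8000):
#   curl -X POST http://localhost:8000/upload/pdf -F "file=@sample.pdf"
# The endpoint returns {"filename": ..., "status": "processing"} immediately
# while process_document runs in the background.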

@app.get("/search")
async def search_documents(query: str, top_k: int = 5):
    """λ¬Έμ„œ 검색 API"""
    try:
        results = vector_store.search(query, top_k)
        return {"results": results}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
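
# Example search request (a sketch; same uvicorn/port assumption as above):
#   curl "http://localhost:8000/search?query=contract+terms&top_k=3"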

# Build the Gradio interface
def process_file(file_path):
    """Process the uploaded file and return the summary"""
    # file_path is already a string path from Gradio's type="filepath"
    if not file_path or not os.path.exists(file_path):
        return "νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. λ‹€μ‹œ μ‹œλ„ν•΄μ£Όμ„Έμš”."
    
    try:
        result = process_document(file_path)
        return result.get("summary", "Could not generate a summary.")
    except Exception as e:
        logger.error(f"Error processing file: {str(e)}", exc_info=True)
        return f"파일 처리 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"

def search(query):
    """Search stored documents and format the top matches for display"""
    # vector_store.search returns the result list directly (see the /search
    # endpoint above, which wraps it in {"results": ...})
    results = vector_store.search(query)
    return "\n\n".join([f"{r['filename']} - similarity: {r['similarity']:.2f}" for r in results])

with gr.Blocks() as demo:
    gr.Markdown("# ParseAI PDF 뢄석 μ„œλΉ„μŠ€")
    
    with gr.Tab("PDF μ—…λ‘œλ“œ"):
        file_input = gr.File(
            label="PDF νŒŒμΌμ„ μ„ νƒν•˜μ„Έμš”",
            file_types=[".pdf"],
            type="filepath"
        )
        upload_button = gr.Button("μ—…λ‘œλ“œ")
        summary_output = gr.Textbox(label="μš”μ•½")
        
        upload_button.click(
            process_file,
            inputs=[file_input],
            outputs=[summary_output]
        )
    
    with gr.Tab("λ¬Έμ„œ 검색"):
        search_input = gr.Textbox(label="검색어 μž…λ ₯")
        search_button = gr.Button("검색")
        search_output = gr.Textbox(label="검색 κ²°κ³Ό")
        
        search_button.click(
            search,
            inputs=[search_input],
            outputs=[search_output]
        )

if __name__ == "__main__":
    demo.launch()
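
# Note: demo.launch() serves only the Gradio UI; the FastAPI routes above are
# not exposed by it. One common pattern (a sketch, not necessarily this
# project's deployment setup) is to mount the Gradio app on the FastAPI
# instance and serve both with a single uvicorn process:
#   app = gr.mount_gradio_app(app, demo, path="/ui")
#   # run with: uvicorn app:app --host 0.0.0.0 --port 7860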