Spaces: Build error
Commit 3022fd1 · 1 Parent(s): 6e46f97
Initial commit: Add ParseAI document processor application
- .gitignore +44 -0
- Dockerfile +54 -0
- README.md +88 -8
- app.py +255 -0
- download_nltk_data.py +26 -0
- extractor.py +57 -0
- requirements.txt +21 -0
- space.yml +11 -0
- start.sh +20 -0
- summarizer.py +120 -0
- vector_store.py +162 -0
.gitignore
ADDED
@@ -0,0 +1,44 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Environment variables
+.env
+
+# Local development
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Data directories
+/data/
+/app/data/
+/app/nltk_data/
+/app/huggingface_cache/
+
+# Logs
+*.log
+
+# OS generated files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# Local test files
+*.pdf
+*.md
+run_local.sh
+create_test_pdf.py
Dockerfile
ADDED
@@ -0,0 +1,54 @@
+FROM python:3.9-slim
+
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    libssl-dev \
+    libffi-dev \
+    python3-dev \
+    python3-pip \
+    git \
+    poppler-utils \
+    tesseract-ocr \
+    tesseract-ocr-eng \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV NLTK_DATA=/app/nltk_data
+ENV HUGGINGFACE_HUB_CACHE=/app/huggingface_cache
+ENV UPLOAD_FOLDER=/app/data/uploads
+
+# Copy requirements first to leverage Docker cache
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application files
+COPY . .
+
+# Make scripts executable
+RUN chmod +x /app/start.sh /app/download_nltk_data.py
+
+# Switch to non-root user
+RUN useradd -m -u 1000 user && \
+    chown -R user:user /app
+
+USER user
+
+# Set working directory
+WORKDIR /app
+
+# Expose the port the app runs on
+EXPOSE 7860
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:7860/ || exit 1
+
+# Command to run the application
+CMD ["/app/start.sh"]
README.md
CHANGED
@@ -1,11 +1,91 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
-sdk:
-
-
+title: ParseAI Document Processor
+emoji: π
+colorFrom: blue
+colorTo: indigo
+sdk: gradio
+sdk_version: 5.30.0
+app_file: app.py
+pinned: true
 ---
 
-
+# ParseAI - Document Processing and Analysis
+
+ParseAI is a powerful tool for processing and analyzing PDF documents. It extracts text from documents, summarizes it, and finds related documents through vector search.
+
+## Key Features
+
+- PDF document upload and text extraction
+- Document content summarization
+- Vector-based document search
+- A user-friendly web interface built on Gradio
+
+## Tech Stack
+
+- **Backend**: FastAPI
+- **Frontend**: Gradio
+- **NLP**: NLTK, Hugging Face Transformers
+- **Vector Store**: Sentence Transformers
+- **Container**: Docker
+
+## Running Locally
+
+### Prerequisites
+
+- Docker and Docker Compose
+- Python 3.9+
+
+### Setting Environment Variables
+
+Create a `.env` file and set the following variables:
+
+```bash
+# Hugging Face Hub configuration
+HUGGINGFACE_HUB_TOKEN=your_hf_token_here
+
+# Application configuration
+UPLOAD_FOLDER=/app/data/uploads
+NLTK_DATA=/app/nltk_data
+```
+
+### Running with Docker
+
+1. Build the Docker image:
+```bash
+docker build -t parseai .
+```
+
+2. Run the container:
+```bash
+docker run -d -p 7860:7860 --env-file .env parseai
+```
+
+3. Open it in a web browser:
+```
+http://localhost:7860
+```
+
+## Deploying to Hugging Face Spaces
+
+1. Push this repository to Hugging Face Spaces.
+2. Set the following environment variables in the repository settings:
+   - `HUGGINGFACE_HUB_TOKEN`: Hugging Face API token
+   - `UPLOAD_FOLDER`: `/app/data/uploads`
+   - `NLTK_DATA`: `/app/nltk_data`
+
+## Usage
+
+1. Upload a PDF file on the **PDF Upload** tab.
+2. Enter keywords on the **Document Search** tab to search for related documents.
+
+## Health Check
+
+The application status can be checked at the following endpoint:
+
+```
+GET /health
+```
+
+## License
+
+This project is distributed under the [MIT License](LICENSE).
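For integrators who want to call the HTTP API directly rather than use the Gradio UI, the sketch below shows one way to hit the `/health`, `/upload/pdf`, and `/search` endpoints from Python with `httpx` (already pinned in `requirements.txt`). It is illustrative only: it assumes the FastAPI routes are actually being served on `localhost:7860` (see the note after `app.py` below), and `sample.pdf` is a placeholder file name.

```python
# Minimal client sketch; assumes the FastAPI endpoints are served on port 7860.
# "sample.pdf" is a placeholder file name, not part of this repository.
import httpx

BASE_URL = "http://localhost:7860"

# Health check
print(httpx.get(f"{BASE_URL}/health").json())

# Upload a PDF; processing runs as a background task on the server
with open("sample.pdf", "rb") as f:
    resp = httpx.post(
        f"{BASE_URL}/upload/pdf",
        files={"file": ("sample.pdf", f, "application/pdf")},
    )
print(resp.json())  # e.g. {"filename": "sample.pdf", "status": "processing"}

# Search previously indexed documents
resp = httpx.get(f"{BASE_URL}/search", params={"query": "vector search", "top_k": 3})
print(resp.json())
```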
app.py
ADDED
@@ -0,0 +1,255 @@
+import os
+import sys
+import asyncio
+import logging
+from pathlib import Path
+from typing import List, Dict, Optional
+
+import gradio as gr
+from fastapi import FastAPI, HTTPException, status, UploadFile, File, BackgroundTasks
+from fastapi.middleware.cors import CORSMiddleware
+from dotenv import load_dotenv
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+logger = logging.getLogger(__name__)
+
+# Load environment variables
+load_dotenv()
+
+# Initialize FastAPI
+app = FastAPI(
+    title="ParseAI API",
+    description="API for processing and analyzing PDF documents",
+    version="1.0.0"
+)
+
+# CORS middleware configuration
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # In production, replace with specific origins
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Directory configuration
+UPLOAD_DIR = Path(os.getenv("UPLOAD_FOLDER", "/app/data/uploads"))
+PROCESSED_DIR = Path(os.getenv("PROCESSED_FOLDER", "/app/data/processed"))
+NLTK_DATA_DIR = Path(os.getenv("NLTK_DATA", "/app/nltk_data"))
+
+# Ensure directories exist with proper permissions
+for directory in [UPLOAD_DIR, PROCESSED_DIR, NLTK_DATA_DIR]:
+    try:
+        directory.mkdir(parents=True, exist_ok=True)
+        os.chmod(directory, 0o755)
+        logger.info(f"Ensured directory exists: {directory}")
+    except Exception as e:
+        logger.error(f"Failed to create directory {directory}: {e}")
+        raise
+
+# Import modules after environment is set up
+try:
+    from extractor import pdf_extractor
+    from summarizer import document_summarizer
+    from vector_store import vector_store
+
+    # Initialize NLTK data
+    import nltk
+    nltk.data.path.append(str(NLTK_DATA_DIR))
+
+    # Verify NLTK data is available
+    try:
+        nltk.data.find('tokenizers/punkt')
+        nltk.data.find('corpora/stopwords')
+        nltk.data.find('corpora/wordnet')
+        nltk.data.find('taggers/averaged_perceptron_tagger')
+        logger.info("NLTK data verified successfully")
+    except LookupError as e:
+        logger.warning(f"NLTK data missing: {e}")
+        # Attempt to download missing data
+        try:
+            nltk.download('punkt', download_dir=str(NLTK_DATA_DIR))
+            nltk.download('stopwords', download_dir=str(NLTK_DATA_DIR))
+            nltk.download('wordnet', download_dir=str(NLTK_DATA_DIR))
+            nltk.download('averaged_perceptron_tagger', download_dir=str(NLTK_DATA_DIR))
+            logger.info("Successfully downloaded NLTK data")
+        except Exception as download_error:
+            logger.error(f"Failed to download NLTK data: {download_error}")
+            raise
+
+except ImportError as e:
+    logger.error(f"Failed to import required modules: {e}")
+    raise
+
+# Health check endpoint
+@app.get("/health")
+async def health_check():
+    """Health check endpoint for monitoring"""
+    return {
+        "status": "healthy",
+        "environment": os.getenv("ENV", "development"),
+        "nltk_data": str(NLTK_DATA_DIR),
+        "upload_dir": str(UPLOAD_DIR),
+        "processed_dir": str(PROCESSED_DIR)
+    }
+
+async def process_document(file_path: str):
+    """Process a document as a background task.
+
+    Args:
+        file_path: Path to the uploaded file
+
+    Returns:
+        dict: Processing results including status and metadata
+    """
+    logger.info(f"Starting document processing: {file_path}")
+
+    try:
+        # Verify file exists
+        if not os.path.exists(file_path):
+            error_msg = f"File not found: {file_path}"
+            logger.error(error_msg)
+            raise FileNotFoundError(error_msg)
+
+        # Extract text from PDF
+        logger.info(f"Extracting text from: {file_path}")
+        extracted_data = pdf_extractor.extract_text(file_path)
+
+        if not extracted_data or "text_by_page" not in extracted_data:
+            error_msg = f"Failed to extract text from: {file_path}"
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        # Combine text from all pages
+        full_text = " ".join([page["text"] for page in extracted_data["text_by_page"]])
+
+        if not full_text.strip():
+            error_msg = f"No text content found in: {file_path}"
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        # Generate summary
+        logger.info(f"Generating summary for: {file_path}")
+        try:
+            summary_result = document_summarizer.summarize_text(full_text)
+        except Exception as e:
+            logger.error(f"Error during summarization: {str(e)}")
+            summary_result = {"full_summary": "Summary generation failed", "chunk_summaries": []}
+
+        # Add to vector store
+        logger.info(f"Adding document to vector store: {file_path}")
+        metadata = {
+            "filename": os.path.basename(file_path),
+            "total_pages": extracted_data.get("total_pages", 0),
+            "summary": summary_result.get("full_summary", ""),
+            "timestamp": extracted_data.get("timestamp", ""),
+            "source": "upload"
+        }
+
+        try:
+            vector_store.add_document(full_text, metadata)
+        except Exception as e:
+            logger.error(f"Failed to add document to vector store: {str(e)}")
+            raise
+
+        # Save processed data
+        processed_path = pdf_extractor.save_extracted_text(
+            {
+                **extracted_data,
+                "summary": summary_result["full_summary"],
+                "chunk_summaries": summary_result["chunk_summaries"]
+            },
+            str(PROCESSED_DIR)
+        )
+
+        return {
+            "status": "success",
+            "processed_file": processed_path,
+            "summary": summary_result["full_summary"]
+        }
+
+    except Exception as e:
+        raise Exception(f"Error while processing the document: {str(e)}")
+
+@app.post("/upload/pdf")
+async def upload_pdf(
+    file: UploadFile = File(...),
+    background_tasks: BackgroundTasks = None
+):
+    """PDF file upload API"""
+    if not file.filename.lower().endswith('.pdf'):
+        raise HTTPException(status_code=400, detail="Only PDF files can be uploaded")
+
+    file_path = UPLOAD_DIR / file.filename
+
+    try:
+        # Save the uploaded file
+        with open(file_path, "wb") as buffer:
+            content = await file.read()
+            buffer.write(content)
+
+        # Process the document asynchronously in the background
+        background_tasks.add_task(process_document, str(file_path))
+
+        return {"filename": file.filename, "status": "processing"}
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/search")
+async def search_documents(query: str, top_k: int = 5):
+    """Document search API"""
+    try:
+        results = vector_store.search(query, top_k)
+        return {"results": results}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+# Build the Gradio interface
+def process_file(file):
+    # Copy the uploaded temp file (provided by Gradio) into the upload directory
+    file_path = UPLOAD_DIR / os.path.basename(file.name)
+    with open(file.name, "rb") as src, open(file_path, "wb") as dst:
+        dst.write(src.read())
+
+    # process_document is a coroutine, so run it to completion here
+    result = asyncio.run(process_document(str(file_path)))
+    return result["summary"]
+
+def search(query):
+    # vector_store.search returns a list of result dicts
+    results = vector_store.search(query)
+    return "\n\n".join([f"{r['filename']} - similarity: {r['similarity']:.2f}" for r in results])
+
+with gr.Blocks() as demo:
+    gr.Markdown("# ParseAI PDF Analysis Service")
+
+    with gr.Tab("PDF Upload"):
+        file_input = gr.File(type="file", file_types=[".pdf"])
+        upload_button = gr.Button("Upload")
+        summary_output = gr.Textbox(label="Summary")
+
+        upload_button.click(
+            process_file,
+            inputs=[file_input],
+            outputs=[summary_output]
+        )
+
+    with gr.Tab("Document Search"):
+        search_input = gr.Textbox(label="Search query")
+        search_button = gr.Button("Search")
+        search_output = gr.Textbox(label="Search results")
+
+        search_button.click(
+            search,
+            inputs=[search_input],
+            outputs=[search_output]
+        )
+
+if __name__ == "__main__":
+    # Bind to all interfaces so the app is reachable from outside the container
+    demo.launch(server_name="0.0.0.0", server_port=7860)
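Note that `app.py` defines FastAPI routes (`/health`, `/upload/pdf`, `/search`) but the `__main__` block only starts the Gradio interface, so `demo.launch()` alone does not serve those routes. One possible way to serve both from a single process is to mount the Blocks app onto the FastAPI instance and run it with `uvicorn` (already pinned in `requirements.txt`). This is a sketch, not part of the commit, and it assumes the installed Gradio version provides `gr.mount_gradio_app`.

```python
# Sketch: serve the FastAPI routes and the Gradio UI together.
# Assumes gr.mount_gradio_app is available in the installed Gradio version.
import gradio as gr
import uvicorn

from app import app, demo  # FastAPI instance and Gradio Blocks defined in app.py

combined = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    uvicorn.run(combined, host="0.0.0.0", port=7860)
```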
download_nltk_data.py
ADDED
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+import nltk
+import os
+
+def download_nltk_data():
+    # Ensure the NLTK data directory exists
+    nltk_data_dir = os.getenv('NLTK_DATA', '/app/nltk_data')
+    os.makedirs(nltk_data_dir, exist_ok=True)
+
+    # Set NLTK data path
+    nltk.data.path.append(nltk_data_dir)
+
+    # Download required NLTK data
+    print("Downloading NLTK data...")
+    try:
+        nltk.download('punkt', download_dir=nltk_data_dir)
+        nltk.download('stopwords', download_dir=nltk_data_dir)
+        nltk.download('wordnet', download_dir=nltk_data_dir)
+        nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_dir)
+        print("NLTK data downloaded successfully!")
+    except Exception as e:
+        print(f"Error downloading NLTK data: {e}")
+        raise
+
+if __name__ == "__main__":
+    download_nltk_data()
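In the container this script is invoked by `start.sh`; it can also be run ad hoc during local development. A small sketch, where `./nltk_data` is an arbitrary local target directory (not a path used by the repository):

```python
# Download the required NLTK corpora into a local directory instead of /app/nltk_data.
import os

os.environ["NLTK_DATA"] = "./nltk_data"  # illustrative target directory

from download_nltk_data import download_nltk_data

download_nltk_data()
```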
extractor.py
ADDED
@@ -0,0 +1,57 @@
+from PyPDF2 import PdfReader
+from typing import Dict, List
+import json
+from datetime import datetime
+import os
+from pathlib import Path
+
+class PDFExtractor:
+    def __init__(self):
+        pass
+
+    def extract_text(self, file_path: str) -> Dict:
+        """Extract text from a PDF file"""
+        try:
+            # Read the PDF file
+            pdf_reader = PdfReader(file_path)
+
+            # Check the number of pages
+            total_pages = len(pdf_reader.pages)
+
+            # Extract text page by page
+            text_by_page = []
+            for page_num, page in enumerate(pdf_reader.pages, 1):
+                text = page.extract_text()
+                if text:
+                    text_by_page.append({
+                        "page_number": page_num,
+                        "text": text
+                    })
+
+            # Return the result
+            return {
+                "filename": os.path.basename(file_path),
+                "total_pages": total_pages,
+                "extracted_pages": len(text_by_page),
+                "text_by_page": text_by_page,
+                "timestamp": datetime.now().isoformat()
+            }
+
+        except Exception as e:
+            raise Exception(f"Error while extracting PDF text: {str(e)}")
+
+    def save_extracted_text(self, extracted_data: Dict, output_dir: str) -> str:
+        """Save the extracted text as a JSON file"""
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        filename = f"extracted_{extracted_data['filename'].split('.')[0]}.json"
+        output_path = output_dir / filename
+
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(extracted_data, f, ensure_ascii=False, indent=2)
+
+        return str(output_path)
+
+# Create a singleton instance
+pdf_extractor = PDFExtractor()
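A short usage sketch for the extractor; `sample.pdf` and the `processed` output directory are placeholders, not files in this repository:

```python
# Usage sketch for PDFExtractor.
from extractor import pdf_extractor

data = pdf_extractor.extract_text("sample.pdf")  # placeholder path
print(data["total_pages"], "pages,", data["extracted_pages"], "with text")

# Persist the extraction result as JSON alongside other processed files
out_path = pdf_extractor.save_extracted_text(data, "processed")
print(out_path)  # e.g. processed/extracted_sample.json
```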
requirements.txt
ADDED
@@ -0,0 +1,21 @@
+fastapi==0.68.0
+uvicorn==0.15.0
+python-multipart==0.0.5
+boto3>=1.34.12
+PyPDF2>=2.0.04
+nltk==3.6.3
+scikit-learn==0.24.2
+numpy>=1.21.0
+scipy>=1.10.1
+sentence-transformers==2.2.0
+datasets==2.0.0
+gradio==3.0.0
+pytest==6.2.5
+pytest-cov==2.12.1
+pytest-asyncio==0.15.1
+httpx==0.19.0
+python-dotenv==0.19.0
+huggingface-hub==0.31.4
+torch>=1.9.0
+transformers>=4.11.0
+pandas>=1.3.0
space.yml
ADDED
@@ -0,0 +1,11 @@
+build:
+  dockerfile: Dockerfile
+
+compute:
+  type: cpu-small
+
+ports:
+  - 7860
+
+env:
+  HF_TOKEN: ${{secrets.HF_TOKEN}}
start.sh
ADDED
@@ -0,0 +1,20 @@
+#!/bin/bash
+set -e
+
+# Create necessary directories
+mkdir -p /app/data/uploads
+mkdir -p /app/nltk_data
+mkdir -p /app/huggingface_cache
+
+# Set permissions
+chown -R 1000:1000 /app
+chmod -R 755 /app
+
+# Download NLTK data if not present
+if [ ! -d "/app/nltk_data/tokenizers" ]; then
+    echo "Downloading NLTK data..."
+    python /app/download_nltk_data.py
+fi
+
+# Start the application
+exec python /app/app.py
summarizer.py
ADDED
@@ -0,0 +1,120 @@
+import nltk
+from typing import Dict, List
+import json
+from datetime import datetime
+import heapq
+
+class DocumentSummarizer:
+    def __init__(self):
+        # Download NLTK data
+        try:
+            nltk.download('punkt', download_dir='/app/nltk_data')
+            nltk.download('stopwords', download_dir='/app/nltk_data')
+            nltk.data.path.append('/app/nltk_data')
+        except Exception as e:
+            print(f"Warning: NLTK data download failed: {str(e)}")
+
+        # Text chunk size setting
+        self.chunk_size = 1000  # maximum chunk length in characters
+        try:
+            self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
+        except Exception as e:
+            print(f"Warning: Failed to load tokenizer: {str(e)}")
+            self.tokenizer = nltk.tokenize.sent_tokenize
+
+    def summarize_text(self, text: str) -> Dict:
+        """Summarize the text"""
+        try:
+            # Split the text into chunks
+            chunks = self._split_text(text)
+
+            # Generate a summary for each chunk
+            summaries = []
+            for chunk in chunks:
+                summary = self._summarize_chunk(chunk)
+                if summary:
+                    summaries.append(summary)
+
+            return {
+                "timestamp": datetime.now().isoformat(),
+                "full_summary": " ".join(summaries),
+                "chunk_summaries": summaries
+            }
+
+        except Exception as e:
+            raise Exception(f"Error while generating the summary: {str(e)}")
+
+    def _summarize_chunk(self, text: str) -> str:
+        """Summarize an individual text chunk"""
+        try:
+            # Preprocess the text
+            words = nltk.word_tokenize(text.lower())
+            sentences = nltk.sent_tokenize(text)
+
+            # Remove stopwords
+            stop_words = set(nltk.corpus.stopwords.words('english'))
+            words = [word for word in words if word.isalnum() and word not in stop_words]
+
+            # Compute word frequencies
+            word_frequencies = {}
+            for word in words:
+                if word not in word_frequencies:
+                    word_frequencies[word] = 1
+                else:
+                    word_frequencies[word] += 1
+
+            # Find the maximum frequency
+            max_frequency = max(word_frequencies.values())
+
+            # Normalize the frequencies
+            for word in word_frequencies:
+                word_frequencies[word] = word_frequencies[word] / max_frequency
+
+            # Score each sentence
+            sentence_scores = {}
+            for sentence in sentences:
+                for word, freq in word_frequencies.items():
+                    if word in sentence.lower():
+                        if sentence not in sentence_scores:
+                            sentence_scores[sentence] = freq
+                        else:
+                            sentence_scores[sentence] += freq
+
+            # Select the top 30% of sentences
+            summary_sentences = heapq.nlargest(
+                int(len(sentences) * 0.3),
+                sentence_scores,
+                key=sentence_scores.get
+            )
+
+            # Build the summary
+            return " ".join(summary_sentences)
+
+        except Exception as e:
+            print(f"Chunk summarization error: {str(e)}")
+            return ""
+
+    def _split_text(self, text: str) -> List[str]:
+        """Split the text into appropriately sized chunks"""
+        try:
+            sentences = nltk.sent_tokenize(text)
+            chunks = []
+            current_chunk = ""
+
+            for sentence in sentences:
+                if len(current_chunk) + len(sentence) <= self.chunk_size:
+                    current_chunk += " " + sentence
+                else:
+                    chunks.append(current_chunk.strip())
+                    current_chunk = sentence
+
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+
+            return chunks
+
+        except Exception as e:
+            raise Exception(f"Error while splitting the text: {str(e)}")
+
+# Create a singleton instance
+document_summarizer = DocumentSummarizer()
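The summarizer is a frequency-based extractive approach: sentences are scored by the normalized frequencies of the words they contain and the top 30% are kept. A minimal usage sketch, assuming the NLTK `punkt` and `stopwords` data are available locally; the input text is illustrative:

```python
# Usage sketch for DocumentSummarizer with illustrative input text.
from summarizer import document_summarizer

text = (
    "ParseAI processes PDF documents. It extracts text with PyPDF2. "
    "Sentences are scored by normalized word frequency. "
    "The highest-scoring sentences form the extractive summary."
)
result = document_summarizer.summarize_text(text)
print(result["full_summary"])
print(len(result["chunk_summaries"]), "chunk summaries")
```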
vector_store.py
ADDED
@@ -0,0 +1,162 @@
+import json
+from typing import Dict, List
+from pathlib import Path
+import numpy as np
+from datetime import datetime
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+from huggingface_hub import HfApi, hf_hub_download
+import os
+
+class VectorStore:
+    def __init__(self):
+        self.documents = []
+        self.metadata = []  # Stores per-document metadata
+        self.model = SentenceTransformer('all-MiniLM-L6-v2')
+        self.hf_api = HfApi()
+        self.dataset_name = "bluewhale2025/parseai_202506"  # Hugging Face dataset name
+
+        # Create the dataset if it does not exist yet
+        try:
+            self.hf_api.create_repo(
+                repo_id=self.dataset_name,
+                repo_type="dataset",
+                private=True  # Keep the dataset private
+            )
+            print(f"Dataset {self.dataset_name} created")
+        except Exception as e:
+            print(f"Error while creating the dataset: {str(e)}")
+
+    def add_document(self, text: str, metadata: Dict) -> None:
+        """Store a document"""
+        try:
+            # Store the raw document text
+            self.documents.append(text)
+
+            # Store the metadata
+            metadata["timestamp"] = str(datetime.now())
+            self.metadata.append(metadata)
+
+            # Generate the embedding vector
+            vector = self.model.encode(text)
+
+            # Local staging paths for the files to upload
+            vector_path = f"vectors/{len(self.documents)}.npy"
+            metadata_path = f"metadata/{len(self.documents)}.json"
+            os.makedirs("vectors", exist_ok=True)
+            os.makedirs("metadata", exist_ok=True)
+
+            # Save to temporary local files
+            np.save(vector_path, vector)
+            with open(metadata_path, 'w', encoding='utf-8') as f:
+                json.dump(metadata, f)
+
+            # Upload to Hugging Face
+            self.hf_api.upload_file(
+                path_or_fileobj=vector_path,
+                path_in_repo=vector_path,
+                repo_id=self.dataset_name,
+                repo_type="dataset"
+            )
+
+            self.hf_api.upload_file(
+                path_or_fileobj=metadata_path,
+                path_in_repo=metadata_path,
+                repo_id=self.dataset_name,
+                repo_type="dataset"
+            )
+
+            # Remove the temporary files
+            os.remove(vector_path)
+            os.remove(metadata_path)
+
+        except Exception as e:
+            raise Exception(f"Error while storing the document: {str(e)}")
+
+    def search(self, query: str, top_k: int = 5) -> List[Dict]:
+        """Keyword search"""
+        try:
+            # Encode the query into a vector
+            query_vector = self.model.encode(query)
+
+            # Load all vectors and metadata from Hugging Face
+            vectors = []
+            metadata = []
+
+            # List all files in the dataset repo
+            files = self.hf_api.list_repo_files(
+                repo_id=self.dataset_name,
+                repo_type="dataset"
+            )
+
+            # Sort the files (numbering starts at 1)
+            vector_files = sorted([f for f in files if f.startswith("vectors/")])
+            metadata_files = sorted([f for f in files if f.startswith("metadata/")])
+
+            if not vector_files or not metadata_files:
+                return []
+
+            # Download and load each vector/metadata pair
+            for vector_file, metadata_file in zip(vector_files, metadata_files):
+                vector_local = hf_hub_download(
+                    repo_id=self.dataset_name,
+                    filename=vector_file,
+                    repo_type="dataset"
+                )
+                vectors.append(np.load(vector_local))
+
+                metadata_local = hf_hub_download(
+                    repo_id=self.dataset_name,
+                    filename=metadata_file,
+                    repo_type="dataset"
+                )
+                with open(metadata_local, 'r', encoding='utf-8') as f:
+                    metadata.append(json.load(f))
+
+            # Compute cosine similarities
+            similarities = cosine_similarity(vectors, [query_vector]).flatten()
+
+            # Sort by similarity
+            sorted_idx = np.argsort(similarities)[::-1][:top_k]
+
+            # Build the results
+            results = []
+            for idx in sorted_idx:
+                results.append({
+                    "filename": metadata[idx]["filename"],
+                    "total_pages": metadata[idx]["total_pages"],
+                    "summary": metadata[idx]["summary"],
+                    "timestamp": metadata[idx]["timestamp"],
+                    "similarity": float(similarities[idx])
+                })
+
+            return results
+
+        except Exception as e:
+            raise Exception(f"Error during search: {str(e)}")
+
+    def _save_metadata(self) -> None:
+        """Save metadata"""
+        try:
+            Path(self.metadata_path).parent.mkdir(parents=True, exist_ok=True)
+            with open(self.metadata_path, 'w', encoding='utf-8') as f:
+                json.dump({
+                    "documents": self.documents,
+                    "metadata": self.metadata
+                }, f, ensure_ascii=False, indent=2)
+        except Exception as e:
+            raise Exception(f"Error while saving metadata: {str(e)}")
+
+    def _load_metadata(self):
+        """Load metadata"""
+        try:
+            if Path(self.metadata_path).exists():
+                with open(self.metadata_path, 'r', encoding='utf-8') as f:
+                    data = json.load(f)
+                    self.documents = data["documents"]
+                    self.metadata = data["metadata"]
+        except Exception as e:
+            raise Exception(f"Error while loading metadata: {str(e)}")
+
+    def load(self) -> None:
+        """Load stored metadata"""
+        self._load_metadata()
+
+# Create a singleton instance
+vector_store = VectorStore()
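Because the store persists every vector to a Hugging Face dataset repository, using it requires a token with write access to `bluewhale2025/parseai_202506` (for example via `HUGGINGFACE_HUB_TOKEN`). A minimal sketch under that assumption; the document text and metadata below are illustrative:

```python
# Usage sketch for VectorStore; requires a Hugging Face token with write access
# to the dataset repository configured for the store.
from vector_store import vector_store

vector_store.add_document(
    "ParseAI indexes document text as sentence-transformer embeddings.",
    {"filename": "example.txt", "total_pages": 1, "summary": "Example document."},
)

for hit in vector_store.search("document embeddings", top_k=3):
    print(hit["filename"], round(hit["similarity"], 2))
```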