pdfcounter / app.py
MKE0108's picture
add
503a81b
import os
import re
import tempfile

import pdfplumber
from fastapi import FastAPI, File, UploadFile
from fastapi import Request
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
# FastAPI application instance; the route decorators in this file attach to it
app = FastAPI()
# Directory the HTML templates are loaded from
templates = Jinja2Templates(directory="templates")
# Regular expression matching one Chinese character (code points U+4E00-U+9FFF)
chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]')
# Regular expression matching one English letter (ASCII A-Z / a-z)
english_char_pattern = re.compile(r'[A-Za-z]')
# Count the Chinese and English characters found in a PDF
def count_chinese_and_english_characters_in_pdf(file_path):
    """Tally Chinese and English characters per page and for the whole PDF.

    Args:
        file_path: Value handed to ``pdfplumber.open`` to open the document.

    Returns:
        A dict with four keys:

        * ``page_counts`` - list of dicts (``page_num``, ``chinese_count``,
          ``english_count``), one per page that yielded text; page numbers
          start at 1 and pages without extractable text are omitted.
        * ``total_chinese_chars`` - sum of the per-page Chinese counts.
        * ``total_english_chars`` - sum of the per-page English counts.
        * ``total_chars`` - Chinese total plus English total.
    """
    per_page = []

    with pdfplumber.open(file_path) as document:
        for number, page in enumerate(document.pages, start=1):
            content = page.extract_text()
            # Nothing to count when the page has no extractable text.
            if not content:
                continue
            per_page.append({
                "page_num": number,
                "chinese_count": len(chinese_char_pattern.findall(content)),
                "english_count": len(english_char_pattern.findall(content)),
            })

    # Totals are derived from the per-page records once the scan is done.
    chinese_total = sum(entry["chinese_count"] for entry in per_page)
    english_total = sum(entry["english_count"] for entry in per_page)

    return {
        "page_counts": per_page,
        "total_chinese_chars": chinese_total,
        "total_english_chars": english_total,
        "total_chars": chinese_total + english_total,
    }
# Home page: renders the upload form
@app.get("/", response_class=HTMLResponse)
async def show_form(request: Request):
    """Render ``upload_form.html`` with only the request in its context.

    Args:
        request: Incoming request, placed in the template context under
            the ``request`` key.

    Returns:
        The template response produced by ``templates.TemplateResponse``.
    """
    context = {"request": request}
    return templates.TemplateResponse("upload_form.html", context)
# Upload endpoint: receives a PDF and renders its character counts
@app.post("/uploadfile/")
async def upload_file(request: Request, file: UploadFile = File(...)):
    """Count the Chinese and English characters of an uploaded PDF.

    Args:
        request: Incoming request, passed through to the template context.
        file: Uploaded file taken from the ``file`` form field.

    Returns:
        The rendered ``upload_form.html`` template carrying the original
        file name, the per-page counts and the totals when the upload is
        declared as ``application/pdf``; otherwise a dict holding an
        ``error`` message.
    """
    # Guard clause: reject anything the client did not declare as a PDF.
    if file.content_type != "application/pdf":
        return {"error": "只接受 PDF 文件"}

    # The client-supplied name is kept for display only. It must never be
    # used as a filesystem path: a name such as "../app.py" would let an
    # uploader overwrite arbitrary files, and two uploads sharing a name
    # would clobber each other.
    file_name = file.filename

    # Store the upload under a fixed name inside a private scratch directory.
    # The directory and its contents are removed when the ``with`` block
    # exits, even if parsing fails, so uploads no longer pile up on disk.
    with tempfile.TemporaryDirectory() as scratch_dir:
        pdf_path = os.path.join(scratch_dir, "upload.pdf")
        with open(pdf_path, "wb") as f:
            f.write(await file.read())
        # Count the characters while the temporary copy still exists.
        result = count_chinese_and_english_characters_in_pdf(pdf_path)

    # Render the results on the form page, including the original file name.
    return templates.TemplateResponse("upload_form.html", {
        "request": request,
        "file_name": file_name,
        "page_counts": result["page_counts"],
        "total_chinese": result["total_chinese_chars"],
        "total_english": result["total_english_chars"],
        "total_chars": result["total_chars"],
    })