| import { limitWords, mapLanguageCodeToPostgresConfig } from "@midday/documents"; |
| import { DocumentClassifier } from "@midday/documents/classifier"; |
| import { triggerJob } from "@midday/job-client"; |
| import type { Job } from "bullmq"; |
| import type { ClassifyDocumentPayload } from "../../schemas/documents"; |
| import { getDb } from "../../utils/db"; |
| import { updateDocumentWithRetry } from "../../utils/document-update"; |
| import { TIMEOUTS, withTimeout } from "../../utils/timeout"; |
| import { BaseProcessor } from "../base"; |
|
|
| |
| |
| |
/**
 * Metadata produced by the AI document classifier for a single document.
 *
 * Every field is nullable: the classifier may be unable to determine any
 * individual value, and consumers must handle `null` for each of them.
 */
interface ClassificationResult {
  /** Title suggested for the document, or `null` when none was produced. */
  title: string | null;

  /** Short description of the document, or `null` when none was produced. */
  summary: string | null;

  /** Date associated with the document, as a string, or `null` when unknown. */
  date: string | null;

  /** Language code detected for the document, or `null` when unknown. */
  language: string | null;

  /** Tags describing the document, or `null` when none were produced. */
  tags: string[] | null;
}
|
|
| |
| |
| |
| |
| |
/** Maximum number of summary characters (code points) embedded in a fallback title. */
const FALLBACK_SUMMARY_MAX_CHARS = 50;

/** Number of leading content characters scanned for document-type keywords. */
const TYPE_INFERENCE_SAMPLE_LENGTH = 200;

/**
 * Matches the abbreviation "inv" only when it is not part of a longer word.
 *
 * A plain substring test for "inv" also fires on "inventory", "investment",
 * "invitation", "reinvent", ... and would mislabel those documents as invoices.
 * Forms such as "inv", "inv.", "inv-001", "inv#12" and "inv001" still match.
 */
const INVOICE_ABBREVIATION = /(?<!\p{L})inv(?!\p{L})/u;

/**
 * Guesses a coarse document type from keywords near the start of the content.
 *
 * @param content - Extracted document text.
 * @returns One of "Invoice", "Receipt", "Contract", "Report" or "Document".
 */
function inferDocumentType(content: string): string {
  const sample = content
    .substring(0, TYPE_INFERENCE_SAMPLE_LENGTH)
    .toLowerCase();

  if (sample.includes("invoice") || INVOICE_ABBREVIATION.test(sample)) {
    return "Invoice";
  }
  if (sample.includes("receipt")) {
    return "Receipt";
  }
  if (sample.includes("contract") || sample.includes("agreement")) {
    return "Contract";
  }
  if (sample.includes("report")) {
    return "Report";
  }
  return "Document";
}

/**
 * Shortens `text` to at most `maxChars` code points, appending "..." when
 * anything was cut off.
 *
 * Counting code points rather than UTF-16 units keeps surrogate pairs (emoji
 * and other astral characters) intact, so no lone surrogate ends up in the
 * result.
 */
function truncateWithEllipsis(text: string, maxChars: number): string {
  const codePoints = Array.from(text);
  return codePoints.length > maxChars
    ? `${codePoints.slice(0, maxChars).join("")}...`
    : text;
}

/**
 * Builds a title for a document whose classification returned no usable title.
 *
 * Shape: `<inferred type> - <summary excerpt | file name without extension>[ - <date>]`.
 *
 * @param result - Classifier output; only `summary` and `date` are read.
 * @param content - Extracted document text, used to infer the document type.
 * @param pathTokens - Document path segments; the last one is the file name.
 * @returns The generated title.
 */
function buildFallbackTitle(
  result: Pick<ClassificationResult, "summary" | "date">,
  content: string,
  pathTokens: readonly string[],
): string {
  // `||` (not `??`) is deliberate: a name that is empty once the extension is
  // stripped, e.g. ".pdf", must also fall back to "Document".
  const fileNameWithoutExt =
    pathTokens[pathTokens.length - 1]?.replace(/\.[^/.]+$/, "") || "Document";

  const summaryPart = result.summary
    ? ` - ${truncateWithEllipsis(result.summary, FALLBACK_SUMMARY_MAX_CHARS)}`
    : "";
  const datePart = result.date ? ` - ${result.date}` : "";

  return `${inferDocumentType(content)}${summaryPart || ` - ${fileNameWithoutExt}`}${datePart}`;
}

/**
 * Job processor that classifies a document with AI and stores the outcome.
 *
 * A classification failure (error or timeout) does not fail the job: the
 * document is still updated and marked "completed", just without AI metadata.
 */
export class ClassifyDocumentProcessor extends BaseProcessor<ClassifyDocumentPayload> {
  /**
   * Classifies the document described by `job.data`, writes the result to the
   * document row and, when tags were produced, enqueues tag embedding.
   *
   * @param job - Job whose data carries `content`, `fileName` and `teamId`.
   * @throws Error when the update matches no document or returns a row
   *   without an `id`.
   */
  async process(job: Job<ClassifyDocumentPayload>): Promise<void> {
    const { content, fileName, teamId } = job.data;
    const db = getDb();

    // The document is addressed by its path segments in the update below.
    const pathTokens = fileName.split("/");

    this.logger.info("Classifying document", {
      fileName,
      pathTokens,
      teamId,
      contentLength: content.length,
    });

    let classificationResult: ClassificationResult | null = null;
    let classificationFailed = false;

    try {
      const classifier = new DocumentClassifier();
      classificationResult = await withTimeout(
        classifier.classifyDocument({ content }),
        TIMEOUTS.AI_CLASSIFICATION,
        `Document classification timed out after ${TIMEOUTS.AI_CLASSIFICATION}ms`,
      );
    } catch (error) {
      // Best effort: log and continue so the document still reaches the
      // "completed" state without AI-derived metadata.
      classificationFailed = true;
      this.logger.warn("AI classification failed, completing with fallback", {
        fileName,
        teamId,
        error: error instanceof Error ? error.message : "Unknown error",
        errorType: error instanceof Error ? error.name : "Unknown",
        contentLength: content.length,
      });
    }

    let finalTitle: string | null = null;

    // Store the trimmed title: the emptiness check is done on the trimmed
    // value, so persisting the raw one would keep stray whitespace.
    const aiTitle = classificationResult?.title?.trim();

    if (aiTitle) {
      finalTitle = aiTitle;
    } else if (classificationResult) {
      // A non-null result means classification succeeded but produced no
      // usable title, so derive one from whatever else is available.
      this.logger.warn(
        "Classification returned null or empty title - generating fallback",
        {
          fileName,
          pathTokens,
          teamId,
          hasSummary: !!classificationResult.summary,
          hasDate: !!classificationResult.date,
          contentLength: content.length,
        },
      );

      finalTitle = buildFallbackTitle(classificationResult, content, pathTokens);

      this.logger.info("Generated fallback title", {
        fileName,
        generatedTitle: finalTitle,
      });
    }

    // Null classifier values become `undefined` here.
    // NOTE(review): presumably `undefined` fields are left unchanged by
    // updateDocumentWithRetry - confirm against that helper.
    const updatedDocs = await updateDocumentWithRetry(
      db,
      {
        pathTokens,
        teamId,
        title: finalTitle ?? undefined,
        summary: classificationResult?.summary ?? undefined,
        content: limitWords(content, 10000),
        date: classificationResult?.date ?? undefined,
        language: mapLanguageCodeToPostgresConfig(
          classificationResult?.language,
        ),
        // Marked completed even when classification failed (see catch above).
        processingStatus: "completed",
      },
      this.logger,
    );

    if (!updatedDocs || updatedDocs.length === 0) {
      this.logger.error("Document not found for classification update", {
        fileName,
        pathTokens,
        teamId,
      });
      throw new Error(`Document with path ${fileName} not found`);
    }

    const data = updatedDocs[0];
    if (!data || !data.id) {
      throw new Error(
        `Document update returned invalid data for path ${fileName}`,
      );
    }

    if (classificationResult?.tags && classificationResult.tags.length > 0) {
      this.logger.info("Triggering document tag embedding", {
        documentId: data.id,
        tagsCount: classificationResult.tags.length,
      });

      // NOTE(review): the jobId is derived from team and document ids,
      // presumably so repeated runs do not enqueue duplicates - confirm
      // against triggerJob.
      await triggerJob(
        "embed-document-tags",
        {
          documentId: data.id,
          tags: classificationResult.tags,
          teamId,
        },
        "documents",
        { jobId: `embed-tags_${teamId}_${data.id}` },
      );
    } else {
      this.logger.info("Document processing completed", {
        documentId: data.id,
        classificationFailed,
        hasTitle: !!finalTitle,
      });
    }
  }
}
|
|