Spaces:

salverz
/

llm-document-parser

Sleeping

Update file structure

4b3616f 8 months ago

1.66 kB

	# config.py
	from pydantic import BaseModel
	from datetime import date
	from typing import List

	# Options: "rapid", "easy", "ocrmac", "tesseract"
	OCR_MODEL = "easy"

	# Must be set when using the tesseract OCR model
	# Linux: "/usr/share/tesseract-ocr/4.00/tessdata"
	# Windows: "C:\\Program Files\\Tesseract-OCR\\tessdata"
	# Mac: "/usr/local/share/tessdata" or "/opt/homebrew/share/tessdata"
	TESSERACT_TESSDATA_LOCATION = "/usr/share/tesseract-ocr/4.00/tessdata"

	OLLAMA_MODEL = "llama3:instruct"

	LLM_PROMPT = """
	Extract all transactions from the following statement. Each transaction must be returned as a JSON object with the fields: transaction_date (YYYY-MM-DD), description, amount, and transaction_type ('deposit' or 'withdrawal'). All of these must be returned as a list of JSON objects under a key called 'transactions'. Here is an example:
	[
	{
	transaction_date: 2025-01-24,
	description: "Walmart",
	amount: 34.24,
	transaction_type: "withdrawl"
	}
	]
	"""

	# Options: "csv", "json", "excel"
	EXPORT_TYPE = "json"

	# Can be a file or directory
	INPUT_PATH = ""
	OUTPUT_FOLDER = ""
	OUTPUT_FILE_NAME = "output"

	# Define Pydantic response models for instructor:

	class BankStatementEntry(BaseModel):
	transaction_date: date \| None \| str
	description: str \| None
	amount: float \| None
	#transaction_type: Literal['deposit', 'withdrawal', None]
	transaction_type: str \| None

	class BankStatement(BaseModel):
	transactions: List[BankStatementEntry] \| None

	# The model that LLM output will conform to
	RESPONSE_MODEL = BankStatement