Spaces:

PopovDanil
/

backend

Sleeping

backend / app /core /chunks.py

try 1

d412733 9 months ago

1.56 kB

	import uuid


	class Chunk:
	    """A slice of text taken from a source document, plus its location metadata.

	    Attributes:
	        id: unique identifier in UUID format (sample values can be generated
	            at https://www.uuidgenerator.net/).
	        filename: name or path of the source document; only the part after
	            the last "/" is shown by ``__str__``.
	        page_number: page the chunk belongs to (presumably within the source
	            document - confirm the numbering base against the caller).
	        start_index: index of the first character of the chunk, counted from
	            the beginning of the original document.
	        start_line: first line covered by the chunk (numbering base and
	            inclusiveness are not defined here - confirm against the caller).
	        end_line: last line covered by the chunk (same caveat as start_line).
	        text: raw text of the chunk.

	    TODO: implement access modifiers and a set of getters and setters.
	    """

	    def __init__(
	        self,
	        id: uuid.UUID,
	        filename: str,
	        page_number: int,
	        start_index: int,
	        start_line: int,
	        end_line: int,
	        text: str,
	    ):
	        """Store the identifier, location metadata and text as given (no validation)."""
	        # NOTE: the parameter name `id` shadows the builtin, but it is part of
	        # the public signature (callers may pass it by keyword), so it stays.
	        self.id: uuid.UUID = id
	        self.filename: str = filename
	        self.page_number: int = page_number
	        self.start_index: int = start_index
	        self.start_line: int = start_line
	        self.end_line: int = end_line
	        self.text: str = text

	    def get_raw_text(self) -> str:
	        """Return the chunk text unchanged."""
	        return self.text

	    def get_splitted_text(self) -> list[str]:
	        """Return the chunk text split on single space characters.

	        Only " " is a separator: consecutive spaces produce empty strings in
	        the result, and tabs/newlines stay inside the tokens.
	        """
	        return self.text.split(" ")

	    def get_metadata(self) -> dict:
	        """Return the location metadata as a dict; the text itself is excluded.

	        The id is converted to its string form; all other values are returned
	        exactly as stored.
	        """
	        return {
	            "id": str(self.id),
	            "filename": self.filename,
	            "page_number": self.page_number,
	            "start_index": self.start_index,
	            "start_line": self.start_line,
	            "end_line": self.end_line,
	        }

	    # TODO: remove the workaround below. The preview always prints the first
	    # 100 and the last 20 characters, so for short texts the head and the tail
	    # overlap and the same content is shown twice.
	    def __str__(self):
	        """Return a one-line summary: file base name, page, line range, text preview.

	        The preview is the first 100 characters, the total text length in
	        parentheses, and the last 20 characters, followed by a newline.
	        """
	        return (
	            f"Chunk from {self.filename.split('/')[-1]}, "
	            f"page - {self.page_number}, "
	            f"start - {self.start_line}, "
	            f"end - {self.end_line}, "
	            f"and text - {self.text[:100]}... ({len(self.text)})...{self.text[-20:]}\n"
	        )