Spaces:

42charlie
/

rag-visualizer

Sleeping

Ahmed Sadik

fix: correct argument order in get_document_text call for chunk context retrieval

e4662f9 26 days ago

4.35 kB

	from datetime import timedelta

	from fastapi import Request
	from services.storage import human_readable_size
	from config import PREVIEW_LENGTH

	#CREATE TABLE IF NOT EXISTS documents (
	# id TEXT PRIMARY KEY,
	# original_filename TEXT NOT NULL,
	# full_text TEXT NOT NULL,
	# text_length INTEGER NOT NULL,
	# file_hash TEXT UNIQUE NOT NULL,
	# chunks_count INTEGER NOT NULL,
	# pages_count INTEGER NOT NULL,
	# created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
	# last_activity_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
	# );

	async def get_pg_pool(request: Request):
	return request.app.state.pg_pool

	async def insert_document(document_id, original_filename, full_text, chunks_count, pages_count, file_hash, pg_pool):
	''' Insert a new document into the database and return its ID '''
	query = '''
	INSERT INTO documents (id, original_filename, full_text, text_length, chunks_count, pages_count, file_hash)
	VALUES ($1, $2, $3, $4, $5, $6, $7)
	'''
	async with pg_pool.acquire() as conn:
	await conn.execute(query, document_id, original_filename, full_text, len(full_text), chunks_count, pages_count, file_hash)
	return document_id

	async def get_document_by_uuid(document_id, pg_pool):
	''' Retrieve document metadata by its UUID '''
	async with pg_pool.acquire() as conn:
	query = 'SELECT id, original_filename, chunks_count, pages_count, text_length, created_at FROM documents WHERE id = $1'
	row = await conn.fetchrow(query, document_id)
	if row:
	return {
	"id": row['id'],
	"filename": row['original_filename'],
	"chunk_count": row['chunks_count'],
	"pages": row['pages_count'],
	"character_count": row['text_length'],
	"size": human_readable_size(row['text_length']),
	"created_at": row['created_at'].isoformat() # Convert Postgres datetime to string
	}
	return None

	async def document_exists(document_id, pg_pool):
	''' Check if a document with the given ID exists in the database '''
	async with pg_pool.acquire() as conn:
	row = await conn.fetchval('SELECT 1 FROM documents WHERE id = $1', document_id)
	return row is not None

	async def get_document_text(document_id, pg_pool, start_char=0, length=PREVIEW_LENGTH):
	''' Retrieve a substring of the document's full text based on character offsets '''
	async with pg_pool.acquire() as conn:
	query = 'SELECT SUBSTR(full_text, $1, $2) FROM documents WHERE id = $3'
	text = await conn.fetchval(query, start_char + 1, length, document_id)
	return text

	async def update_document_activity(document_id, pg_pool):
	''' Update the last activity timestamp for a document '''
	async with pg_pool.acquire() as conn:
	await conn.execute('UPDATE documents SET last_activity_at = CURRENT_TIMESTAMP WHERE id = $1', document_id)

	async def delete_inactive_documents(pg_pool, inactivity_threshold_hours=48):
	''' Delete documents that haven't been accessed within the specified inactivity threshold '''
	async with pg_pool.acquire() as conn:
	# Pass a standard Python timedelta, and asyncpg translates it to a Postgres INTERVAL
	threshold = timedelta(hours=inactivity_threshold_hours)
	query = '''
	DELETE FROM documents
	WHERE last_activity_at < CURRENT_TIMESTAMP - $1::interval
	RETURNING id
	'''
	records = await conn.fetch(query, threshold)

	return [record['id'] for record in records]

	async def get_document_by_hash(file_hash, pg_pool):
	''' Retrieve document metadata by its file hash '''
	async with pg_pool.acquire() as conn:
	query = 'SELECT id, original_filename, chunks_count, pages_count, text_length, created_at FROM documents WHERE file_hash = $1'
	row = await conn.fetchrow(query, file_hash)
	if row:
	return {
	"id": row['id'],
	"filename": row['original_filename'],
	"chunk_count": row['chunks_count'],
	"pages": row['pages_count'],
	"character_count": row['text_length'],
	"size": human_readable_size(row['text_length']),
	}
	return None