Spaces:
Sleeping
Sleeping
| """Deduplication module for fabric-to-espanso.""" | |
| import logging | |
| from typing import List, Dict, Any, Tuple, Set | |
| import difflib | |
| from qdrant_client import QdrantClient | |
| from qdrant_client.http.models import Filter, PointIdsList | |
| from .config import config | |
| from .database import get_dense_vector_name, get_sparse_vector_name | |
| logger = logging.getLogger('fabric_to_espanso') | |
def calculate_text_difference_percentage(text1: str, text2: str) -> float:
    """
    Return how different two strings are, as a fraction.

    Uses difflib.SequenceMatcher: the similarity ratio is inverted so that
    0.0 means the strings are identical and 1.0 means completely different.

    Args:
        text1: First text string
        text2: Second text string

    Returns:
        Percentage difference as a float between 0.0 (identical) and 1.0 (completely different)
    """
    return 1.0 - difflib.SequenceMatcher(None, text1, text2).ratio()
| # TODO: Consider moving the vector similarity search functionality to database_query.py and import it here | |
| # This would create a more structured codebase with search functionality centralized in one place | |
def find_duplicates(client: QdrantClient, collection_name: str = config.embedding.collection_name) -> List[Tuple[str, List[str]]]:
    """
    Find duplicate entries in the database based on semantic similarity and text difference.

    Two-step verification:
    1. Vector search for points with semantic similarity >= SIMILARITY_THRESHOLD.
    2. Of those, only points whose text differs by <= DIFFERENCE_THRESHOLD count as duplicates.

    Args:
        client: Initialized Qdrant client
        collection_name: Name of the collection to query

    Returns:
        List of tuples containing (kept_point_id, [duplicate_point_ids])
    """
    # Constants for duplicate detection
    SIMILARITY_THRESHOLD = 0.85  # Minimum semantic similarity to consider as potential duplicate
    DIFFERENCE_THRESHOLD = 0.1   # Maximum text difference (10%) to consider as duplicate

    # Get all points from the database
    all_points = client.scroll(
        collection_name=collection_name,
        with_vectors=True,  # Include vector data, else no vector will be available
        limit=10000  # Adjust based on expected file count
    )[0]
    logger.info(f"Checking {len(all_points)} entries for duplicates")

    # Hoisted out of the loop: the vector name is a property of the collection,
    # not of an individual point, so one lookup suffices.
    dense_vector_name = get_dense_vector_name(client, collection_name)

    # O(1) content lookup by point id; the previous implementation rescanned
    # all_points for every similar point (O(n^2) overall).
    content_by_id = {p.id: p.payload.get('content', '') for p in all_points}

    # Track processed points to avoid redundant comparisons
    processed_points = set()
    # Store duplicates as (kept_id, [duplicate_ids])
    duplicates = []

    # For each point, find semantically similar points
    for point in all_points:
        if point.id in processed_points:
            continue

        point_id = point.id
        point_content = content_by_id[point_id]
        logger.debug(f"Checking point {point_id} for duplicates")
        logger.debug(f"Content: {point_content}")

        # Skip if no content
        if not point_content:
            logger.debug(f"Skipping point {point_id} as it has no content")
            continue

        # Skip points without vector or without the required vector type
        if not point.vector or dense_vector_name not in point.vector:
            logger.debug(f"Skipping point {point_id} as it has no valid vector")
            continue

        # Find semantically similar points using Qdrant's search
        similar_points = client.search(
            collection_name=collection_name,
            query_vector=(dense_vector_name, point.vector.get(dense_vector_name)),
            limit=100,
            score_threshold=SIMILARITY_THRESHOLD  # Only consider points with similarity > threshold
        )

        # Skip the first result (which is the point itself)
        similar_points = [p for p in similar_points if p.id != point_id]
        if not similar_points:
            continue

        logger.debug(f"Found {len(similar_points)} semantically similar points for {point.payload.get('filename', 'unknown')}")

        # Check text difference for each similar point
        duplicate_ids = []
        for similar_point in similar_points:
            similar_id = similar_point.id

            # Skip if already processed
            if similar_id in processed_points:
                continue

            # Get content of similar point (may be missing if the search
            # returned a point outside the scrolled window)
            similar_content = content_by_id.get(similar_id)
            if not similar_content:
                continue

            # Calculate text difference percentage
            diff_percentage = calculate_text_difference_percentage(point_content, similar_content)

            # If difference is less than threshold, consider it a duplicate
            if diff_percentage <= DIFFERENCE_THRESHOLD:
                duplicate_ids.append(similar_id)
                processed_points.add(similar_id)
                logger.debug(f"Found duplicate: {similar_id} (diff: {diff_percentage:.2%})")

        if duplicate_ids:
            duplicates.append((point_id, duplicate_ids))
        processed_points.add(point_id)

    logger.info(f"Found {sum(len(dups) for _, dups in duplicates)} duplicate entries in {len(duplicates)} groups")
    return duplicates
def remove_duplicates(client: QdrantClient, collection_name: str = config.embedding.collection_name) -> int:
    """
    Remove duplicate entries from the database based on semantic similarity and text difference.

    Uses a two-step verification process (see find_duplicates):
    1. Find entries with semantic similarity >= 0.85 (using vector search)
    2. For those entries, keep only those with text difference <= 10%

    Note: the thresholds above mirror the constants defined in find_duplicates;
    the previous docstring (0.9 / 5%) did not match the implementation.

    Args:
        client: Initialized Qdrant client
        collection_name: Name of the collection to query

    Returns:
        Number of removed duplicate entries
    """
    # Find duplicates
    duplicate_groups = find_duplicates(client, collection_name)
    if not duplicate_groups:
        logger.info("No duplicates found")
        return 0

    # Count total duplicates
    total_duplicates = sum(len(dups) for _, dups in duplicate_groups)

    # Remove duplicates; the kept id in each group is left untouched
    for _, duplicate_ids in duplicate_groups:
        if duplicate_ids:
            client.delete(
                collection_name=collection_name,
                points_selector=PointIdsList(points=duplicate_ids)
            )

    logger.info(f"Removed {total_duplicates} duplicate entries from the database")
    return total_duplicates