cryogenic22 committed on
Commit
dd96f4e
·
verified ·
1 Parent(s): 94dc3dd

Create utils/document_search.py

Browse files
Files changed (1) hide show
  1. utils/document_search.py +89 -0
utils/document_search.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ #utils/document_search.py
3
+ from typing import List, Dict, Optional
4
+ import sqlite3
5
+ from fuzzywuzzy import fuzz
6
+ import streamlit as st
7
+ from datetime import datetime
8
+
9
def search_documents(
    conn: sqlite3.Connection,
    query: str,
    collection_id: Optional[int] = None,
    filters: Optional[Dict] = None
) -> List[Dict]:
    """
    Search documents using fuzzy matching and filters.

    Candidate rows are selected in SQL (optionally narrowed by collection
    and upload-date range) and then scored in Python with
    ``fuzz.partial_ratio`` against the document name and the leading part
    of its content.

    Args:
        conn: Database connection
        query: Search query
        collection_id: Optional collection filter. A falsy value (``None``
            or ``0``) means "do not filter by collection".
        filters: Optional dictionary of filters. Only the ``'date_range'``
            key is read here; when present and non-empty it must be a
            ``(start_date, end_date)`` pair applied to ``d.upload_date``.

    Returns:
        A list of dicts with the keys ``id``, ``name``, ``content``,
        ``upload_date``, ``collections`` (list of collection names) and
        ``match_score``, sorted by ``match_score`` descending. Only
        documents scoring above the threshold are included. An empty list
        is returned when nothing matches or when a database error occurs
        (the error is reported through ``st.error``).
    """
    # Only this many leading characters of the content are scored, to keep
    # fuzzy matching cheap on large documents.
    content_limit = 1000
    # Scores must be strictly greater than this to count as a match.
    score_threshold = 60

    # GROUP BY d.id already yields one row per document, so no DISTINCT
    # is needed.
    sql = """
        SELECT
            d.id,
            d.name,
            d.content,
            d.upload_date,
            GROUP_CONCAT(c.name) as collections
        FROM documents d
        LEFT JOIN document_collections dc ON d.id = dc.document_id
        LEFT JOIN collections c ON dc.collection_id = c.id
    """

    params = []
    where_clauses = []

    # Add collection filter if specified. The filter is a subquery on d.id
    # rather than a condition on the joined dc row: filtering the joined
    # rows would also strip every other collection out of GROUP_CONCAT, so
    # 'collections' would only ever contain the filtered collection.
    if collection_id:
        where_clauses.append(
            "d.id IN (SELECT document_id FROM document_collections"
            " WHERE collection_id = ?)"
        )
        params.append(collection_id)

    # Add date filters if specified. A missing or None/empty date_range is
    # treated as "no date filter" instead of failing on unpacking.
    date_range = filters.get('date_range') if filters else None
    if date_range:
        start_date, end_date = date_range
        where_clauses.append("d.upload_date BETWEEN ? AND ?")
        params.extend([start_date, end_date])

    # Combine WHERE clauses
    if where_clauses:
        sql += " WHERE " + " AND ".join(where_clauses)

    sql += " GROUP BY d.id"

    # Lowercase the query once instead of once per row and per field.
    needle = query.lower()

    try:
        cursor = conn.cursor()
        cursor.execute(sql, params)

        # Process results with fuzzy matching
        documents = []
        for doc_id, name, content, upload_date, collection_names in cursor:
            # NULL name/content columns come back as None; score them as
            # empty text rather than crashing on .lower().
            name_text = name or ""
            # Slice before lowercasing so only the scored prefix is
            # processed, not the whole document body.
            content_text = (content or "")[:content_limit]

            name_score = fuzz.partial_ratio(needle, name_text.lower())
            content_score = fuzz.partial_ratio(needle, content_text.lower())

            # Use maximum score between name and content
            match_score = max(name_score, content_score)

            if match_score > score_threshold:
                documents.append({
                    'id': doc_id,
                    'name': name,
                    'content': content,
                    'upload_date': upload_date,
                    'collections': (
                        collection_names.split(',') if collection_names else []
                    ),
                    'match_score': match_score
                })

        # Sort by match score, best first
        documents.sort(key=lambda doc: doc['match_score'], reverse=True)
        return documents

    except sqlite3.Error as e:
        st.error(f"Error searching documents: {e}")
        return []