Spaces:

InnoTrack
/

Graduation_Project-v1.2

Running

App Files Community

bat-6 committed 17 days ago

Commit

eecb5f7

1 Parent(s): cf51ebc

feat: implement semantic search functionality and engine for project similarity analysis

Browse files

Files changed (2) hide show

src/similarity_model/semantic_search.py +26 -3
src/similarity_model/similarity_engine.py +44 -23

src/similarity_model/semantic_search.py CHANGED Viewed

@@ -87,9 +87,32 @@ def load_metadata():
     mtime = os.path.getmtime(INDEX_PATH)
     if _cached_metadata is None or _cached_metadata_mtime != mtime:
-        logger.info(f"Loading metadata from Azure SQL (syncing with FAISS index mtime: {mtime})...")
-        df = load_preprocessed_projects()
-        _cached_metadata = df.reset_index(drop=True)
         _cached_metadata_mtime = mtime
     return _cached_metadata

     mtime = os.path.getmtime(INDEX_PATH)
     if _cached_metadata is None or _cached_metadata_mtime != mtime:
+        if META_PATH.exists():
+            logger.info(f"Loading metadata from local parquet {META_PATH} (syncing with FAISS index mtime: {mtime})...")
+            try:
+                df = pd.read_parquet(str(META_PATH))
+                if "features" in df.columns:
+                    import json
+                    def parse_features(x):
+                        if not isinstance(x, str):
+                            return x
+                        try:
+                            x = json.loads(x)
+                            if isinstance(x, str):
+                                x = json.loads(x)
+                            return x
+                        except Exception:
+                            return []
+                    df["features"] = df["features"].apply(parse_features)
+                _cached_metadata = df.reset_index(drop=True)
+            except Exception as e:
+                logger.warning(f"Failed to read local metadata.parquet: {e}. Falling back to database query...")
+                df = load_preprocessed_projects()
+                _cached_metadata = df.reset_index(drop=True)
+        else:
+            logger.info(f"Loading metadata from Azure SQL (syncing with FAISS index mtime: {mtime})...")
+            df = load_preprocessed_projects()
+            _cached_metadata = df.reset_index(drop=True)
         _cached_metadata_mtime = mtime
     return _cached_metadata

src/similarity_model/similarity_engine.py CHANGED Viewed

@@ -291,7 +291,44 @@ def find_similar_projects(
     )
     if not exclude_self:
-        # Check database table 'preprocess' directly to see if this project title already exists
         db_match_found = False
         db_project_id = None
         db_features = []
@@ -325,29 +362,13 @@ def find_similar_projects(
                     db_features = []
         except Exception as db_exc:
             logger.warning(f"Direct database check of 'preprocess' table failed: {db_exc}")
-        # Check local/cached metadata dataframe as fallback
-        existing_project = df[
-            df["project_title"].str.lower().str.strip()
-            ==
-            title.lower().strip()
-        ]
-        if len(existing_project) > 0 or db_match_found:
-            logger.info("Exact title match found in preprocess table — returning originality = 0")
-            if len(existing_project) > 0:
-                matched_row = existing_project.iloc[0]
-                project_id = int(matched_row.name)
-                stored_features = safe_feature_list(matched_row["features"])
-                matched_title = matched_row["project_title"]
-                matched_abstract = matched_row.get("abstract", "")
-                matched_desc = matched_row.get("description", "")
-            else:
-                project_id = int(db_project_id)
-                stored_features = db_features
-                matched_abstract = db_abstract
-                matched_desc = db_description
             return pd.DataFrame([{
                 "project_id":           project_id,

     )
     if not exclude_self:
+        # Check local/cached metadata dataframe first (extremely fast, zero DB/network overhead)
+        existing_project = df[
+            df["project_title"].str.lower().str.strip()
+            ==
+            title.lower().strip()
+        ]
+        if len(existing_project) > 0:
+            logger.info("Exact title match found in local metadata cache — returning originality = 0")
+            matched_row = existing_project.iloc[0]
+            project_id = int(matched_row.name)
+            stored_features = safe_feature_list(matched_row["features"])
+            matched_title = matched_row["project_title"]
+            matched_abstract = matched_row.get("abstract", "")
+            matched_desc = matched_row.get("description", "")
+            return pd.DataFrame([{
+                "project_id":           project_id,
+                "project_title":        matched_title,
+                "semantic_score":       1.0,
+                "feature_score":        1.0,
+                "coverage":             1.0,
+                "hybrid_score":         1.0,
+                "originality_score":    0.0,
+                "confidence_score":     1.0,
+                "duplicate_risk":       "Very High",
+                "shared_features_count": len(stored_features),
+                "matched_features":     [{"feature_a": f, "feature_b": f, "score": 1.0} for f in stored_features],
+                "unique_query_features": [],
+                "unique_candidate_features": [],
+                "query_features_used":  stored_features,
+                "query_clean_text":     title,
+                "candidate_features":   stored_features,
+                "abstract":             matched_abstract,
+                "description":          matched_desc
+            }])
+        # If not found in cache, check database table 'preprocess' directly
         db_match_found = False
         db_project_id = None
         db_features = []
                     db_features = []
         except Exception as db_exc:
             logger.warning(f"Direct database check of 'preprocess' table failed: {db_exc}")
+        if db_match_found:
+            logger.info("Exact title match found in database 'preprocess' table — returning originality = 0")
+            project_id = int(db_project_id)
+            stored_features = db_features
+            matched_abstract = db_abstract
+            matched_desc = db_description
             return pd.DataFrame([{
                 "project_id":           project_id,