Spaces:

0edon
/

test

Paused

App Files Files Community

christopher committed on May 6, 2025

Commit

0424ce2

1 Parent(s): 21f3f8a

reverted query processor

Browse files

Files changed (1) hide show

database/query_processor.py +22 -16

database/query_processor.py CHANGED Viewed

@@ -29,28 +29,22 @@ class QueryProcessor:
             # Query processing
             query_embedding = self.embedding_model.encode(query).tolist()
-            logger.debug(f"Generated embedding for query: {query[:50]}...")
-            # Entity extraction
             entities = self.nlp_model.extract_entities(query)
-            logger.debug(f"Extracted entities: {entities}")
             # Database search
-            articles = await self._execute_semantic_search(
                 query_embedding,
                 start_dt,
                 end_dt,
                 topic,
-                [ent[0] for ent in entities]  # Just the entity texts
             )
             if not articles:
-                logger.info("No articles found matching criteria")
                 return {"message": "No articles found", "articles": []}
             # Summary generation
             summary_data = self._generate_summary(articles)
             return {
                 "summary": summary_data["summary"],
                 "key_sentences": summary_data["key_sentences"],
@@ -70,22 +64,34 @@ class QueryProcessor:
             logger.error(f"Invalid date format: {date_str}")
             raise ValueError(f"Invalid date format. Expected YYYY-MM-DD, got {date_str}")
     async def _execute_semantic_search(
         self,
         query_embedding: List[float],
         start_date: Optional[dt],
         end_date: Optional[dt],
         topic: Optional[str],
-        entities: List[str]
     ) -> List[Dict[str, Any]]:
         """Execute search with proper error handling"""
         try:
             return await self.db_service.semantic_search(
                 query_embedding=query_embedding,
                 start_date=start_date,
                 end_date=end_date,
                 topic=topic,
-                entities=entities
             )
         except Exception as e:
             logger.error(f"Semantic search failed: {str(e)}")
@@ -94,10 +100,11 @@ class QueryProcessor:
     def _generate_summary(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
         """Generate summary from articles with fallback handling"""
         try:
-            # Extract and process content
             sentences = []
-            for article in articles:
-                if content := article.get("content"):
                     sentences.extend(self.nlp_model.tokenize_sentences(content))
             if not sentences:
@@ -107,17 +114,16 @@ class QueryProcessor:
                     "key_sentences": []
                 }
-            # Generate summary
             embeddings = self.embedding_model.encode(sentences)
             similarity_matrix = np.inner(embeddings, embeddings)
             centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
-            # Get top 10 most central sentences
             top_indices = np.argsort(-centrality_scores)[:10]
             key_sentences = [sentences[idx].strip() for idx in top_indices]
             return {
-                "summary": self.summarization_model.summarize(' '.join(key_sentences)),
                 "key_sentences": key_sentences
             }

             # Query processing
             query_embedding = self.embedding_model.encode(query).tolist()
             entities = self.nlp_model.extract_entities(query)
             # Database search
+            articles = await self._execute_search(
                 query_embedding,
                 start_dt,
                 end_dt,
                 topic,
+                [ent[0] for ent in entities]
             )
             if not articles:
                 return {"message": "No articles found", "articles": []}
             # Summary generation
             summary_data = self._generate_summary(articles)
             return {
                 "summary": summary_data["summary"],
                 "key_sentences": summary_data["key_sentences"],
             logger.error(f"Invalid date format: {date_str}")
             raise ValueError(f"Invalid date format. Expected YYYY-MM-DD, got {date_str}")
+    def _extract_entities_safely(self, text: str) -> List[Tuple[str, str]]:
+        """Robust entity extraction handling both strings and lists"""
+        try:
+            if isinstance(text, list):
+                logger.warning("Received list input for entity extraction, joining to string")
+                text = " ".join(text)
+            return self.nlp_model.extract_entities(text)
+        except Exception as e:
+            logger.error(f"Entity extraction failed: {str(e)}")
+            return []
     async def _execute_semantic_search(
         self,
         query_embedding: List[float],
         start_date: Optional[dt],
         end_date: Optional[dt],
         topic: Optional[str],
+        entities: List[Tuple[str, str]]
     ) -> List[Dict[str, Any]]:
         """Execute search with proper error handling"""
         try:
+            entity_texts = [ent[0] for ent in entities]
             return await self.db_service.semantic_search(
                 query_embedding=query_embedding,
                 start_date=start_date,
                 end_date=end_date,
                 topic=topic,
+                entities=entity_texts
             )
         except Exception as e:
             logger.error(f"Semantic search failed: {str(e)}")
     def _generate_summary(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
         """Generate summary from articles with fallback handling"""
         try:
+            contents = [article["content"] for article in articles]
             sentences = []
+            for content in contents:
+                if content:
                     sentences.extend(self.nlp_model.tokenize_sentences(content))
             if not sentences:
                     "key_sentences": []
                 }
             embeddings = self.embedding_model.encode(sentences)
             similarity_matrix = np.inner(embeddings, embeddings)
             centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
             top_indices = np.argsort(-centrality_scores)[:10]
             key_sentences = [sentences[idx].strip() for idx in top_indices]
+            combined_text = ' '.join(key_sentences)
             return {
+                "summary": self.summarization_model.summarize(combined_text),
                 "key_sentences": key_sentences
             }