Spaces:

0edon
/

test

Paused

App Files Files Community

christopher committed on May 6, 2025

Commit

c8d57fb

1 Parent(s): 1db196f

changed nlp and query processors to fix issues with lists

Browse files

Files changed (2) hide show

database/query_processor.py +59 -44
models/nlp.py +5 -9

database/query_processor.py CHANGED Viewed

@@ -2,6 +2,9 @@ import datetime
 from typing import List, Dict, Any, Optional
 import numpy as np
 from models.LexRank import degree_centrality_scores
 class QueryProcessor:
     def __init__(self, embedding_model, summarization_model, nlp_model, db_service):
@@ -17,51 +20,63 @@ class QueryProcessor:
         start_date: Optional[str] = None,
         end_date: Optional[str] = None
     ) -> Dict[str, Any]:
-        # Convert string dates to datetime objects
-        start_dt = datetime.strptime(start_date, "%Y-%m-%d") if start_date else None
-        end_dt = datetime.strptime(end_date, "%Y-%m-%d") if end_date else None
-        # Get query embedding
-        query_embedding = self.embedding_model.encode(query).tolist()
-        # Get entities from the query
-        doc = self.nlp_model(query)
-        entities = [ent.text.lower() for ent in doc.ents]  # Extract entity texts
-        # Semantic search with entities
-        articles = await self.db_service.semantic_search(
-            query_embedding=query_embedding,
-            start_date=start_dt,
-            end_date=end_dt,
-            topic=topic,
-            entities=entities  # Pass entities to the search
-        )
-        if not articles:
-            return {"error": "No articles found matching the criteria"}
-        # Step 3: Process results
-        contents = [article["content"] for article in articles]
-        sentences = []
-        for content in contents:
-            sentences.extend(self.nlp_model.tokenize_sentences(content))
-        # Step 4: Generate summary
-        if sentences:
-            embeddings = self.embedding_model.encode(sentences)
-            similarity_matrix = np.inner(embeddings, embeddings)
-            centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
-            top_indices = np.argsort(-centrality_scores)[0:10]
-            key_sentences = [sentences[idx].strip() for idx in top_indices]
-            combined_text = ' '.join(key_sentences)
-            summary = self.summarization_model.summarize(combined_text)
-        else:
-            key_sentences = []
-            summary = "No content available for summarization"
-        return {
-            "summary": summary,
-            "articles": articles
-        }

 from typing import List, Dict, Any, Optional
 import numpy as np
 from models.LexRank import degree_centrality_scores
+import logging
+logger = logging.getLogger(__name__)
 class QueryProcessor:
     def __init__(self, embedding_model, summarization_model, nlp_model, db_service):
         start_date: Optional[str] = None,
         end_date: Optional[str] = None
     ) -> Dict[str, Any]:
+        try:
+            # Convert string dates to datetime objects
+            start_dt = datetime.strptime(start_date, "%Y-%m-%d") if start_date else None
+            end_dt = datetime.strptime(end_date, "%Y-%m-%d") if end_date else None
+            # Get query embedding
+            query_embedding = self.embedding_model.encode(query).tolist()
+            logger.debug(f"Generated query embedding for: {query[:50]}...")
+            # Extract entities using the NLP model
+            entities = self.nlp_model.extract_entities(query)  # Changed from direct call to using method
+            logger.debug(f"Extracted entities: {entities}")
+            # Semantic search with entities
+            articles = await self.db_service.semantic_search(
+                query_embedding=query_embedding,
+                start_date=start_dt,
+                end_date=end_dt,
+                topic=topic,
+                entities=[ent[0] for ent in entities]  # Using just the entity texts
+            )
+            if not articles:
+                logger.info("No articles found matching search criteria")
+                return {"error": "No articles found matching the criteria"}
+            # Process results
+            contents = [article["content"] for article in articles]
+            sentences = []
+            for content in contents:
+                sentences.extend(self.nlp_model.tokenize_sentences(content))
+            logger.debug(f"Processing {len(sentences)} sentences for summarization")
+            # Generate summary
+            if sentences:
+                embeddings = self.embedding_model.encode(sentences)
+                similarity_matrix = np.inner(embeddings, embeddings)
+                centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
+                top_indices = np.argsort(-centrality_scores)[0:10]
+                key_sentences = [sentences[idx].strip() for idx in top_indices]
+                combined_text = ' '.join(key_sentences)
+                summary = self.summarization_model.summarize(combined_text)
+                logger.debug(f"Generated summary with {len(key_sentences)} key sentences")
+            else:
+                key_sentences = []
+                summary = "No content available for summarization"
+                logger.warning("No sentences available for summarization")
+            return {
+                "summary": summary,
+                "articles": articles,
+                "entities": entities  # Include extracted entities in response
+            }
+        except Exception as e:
+            logger.error(f"Error in QueryProcessor: {str(e)}", exc_info=True)
+            return {"error": f"Processing error: {str(e)}"}

models/nlp.py CHANGED Viewed

@@ -11,15 +11,11 @@ class NLPModel:
         return self.extract_entities(text)  # or another default method
     def extract_entities(self, text: str):
-        if isinstance(text, list):  # If input is a list of sentences
-            entities = []
-            for sentence in text:
-                doc = self.nlp(sentence)
-                entities.extend([(ent.text.lower(), ent.label_) for ent in doc.ents])
-            return entities
-        else:  # If input is a single string
-            doc = self.nlp(text)
-            return [(ent.text.lower(), ent.label_) for ent in doc.ents]
     def tokenize_sentences(self, text: str):

         return self.extract_entities(text)  # or another default method
     def extract_entities(self, text: str):
+        """Ensure this always takes a string and returns entities"""
+        if isinstance(text, list):  # If accidentally passed a list
+            text = " ".join(text)  # Combine into single string
+        doc = self.nlp(text)
+        return [(ent.text.lower(), ent.label_) for ent in doc.ents]
     def tokenize_sentences(self, text: str):