FD900 committed on
Commit
06f1955
·
verified ·
1 Parent(s): 5009282

Update tools/content_retriever_tool.py

Browse files
Files changed (1) hide show
  1. tools/content_retriever_tool.py +40 -35
tools/content_retriever_tool.py CHANGED
@@ -1,58 +1,63 @@
1
- from smolagents import Tool
2
  from docling.document_converter import DocumentConverter
3
  from docling.chunking import HierarchicalChunker
4
  from sentence_transformers import SentenceTransformer, util
5
  import torch
6
 
7
class ContentRetrievalTool(Tool):
    """Tool that pulls query-relevant passages out of a web page or document.

    The source is converted via docling, chunked hierarchically, and both the
    raw chunk texts and their contextual wrappers are embedded. Chunks are
    ranked against each comma-separated query term by softmax-normalized
    cosine similarity, and the contextualized text of the selected chunks is
    returned joined by blank lines.
    """

    name = 'content_retrieval'
    description = """Extracts and summarizes relevant content from webpages or documents. Supports formats like PDF, DOCX, HTML, XLSX, etc."""

    inputs = {
        "url": {
            "type": "string",
            "description": "The path or web link to the file or page to process."
        },
        "query": {
            "type": "string",
            "description": "Main subject or keyword to retrieve from the content."
        },
    }
    output_type = "string"

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2', threshold: float = 0.2, **kwargs):
        """Set up the converter, chunker, and sentence-embedding model.

        Args:
            model_name: SentenceTransformer model identifier.
            threshold: Softmax-probability cutoff that stops chunk selection.
            **kwargs: Forwarded unchanged to the Tool base class.
        """
        super().__init__(**kwargs)
        self.threshold = threshold
        self._converter = DocumentConverter()
        self._chunker = HierarchicalChunker()
        self._model = SentenceTransformer(model_name)

    def forward(self, url: str, query: str) -> str:
        """Return the contextualized chunks of *url* most relevant to *query*."""
        document = self._converter.convert(url).document
        if not document:
            return "Failed to load content."

        segments = list(self._chunker.chunk(document))
        if not segments:
            return "No content detected."

        segment_texts = [seg.text for seg in segments]
        # Context-only strings: contextualized chunk with its raw text stripped out.
        segment_contexts = [
            self._chunker.contextualize(seg).replace(seg.text, "").strip()
            for seg in segments
        ]

        # Embed the raw texts and the context strings separately; both corpora
        # get to vote on which chunks are relevant.
        all_embeddings = [
            self._model.encode(segment_texts, convert_to_tensor=True),
            self._model.encode(segment_contexts, convert_to_tensor=True),
        ]
        terms = [part.strip() for part in query.split(',') if part.strip()]
        query_emb = self._model.encode(terms, convert_to_tensor=True)

        idx = set()
        for emb in all_embeddings:
            # One similarity row per query term.
            for similarity in util.pytorch_cos_sim(query_emb, emb):
                probs = torch.nn.functional.softmax(similarity, dim=0)
                ranked = torch.argsort(probs, descending=True)
                # NOTE(review): indices are accumulated from the best match
                # downward and the walk stops at the first one whose own
                # probability reaches the threshold — preserved exactly as the
                # original implemented it.
                for position in ranked:
                    idx.add(position.item())
                    if probs[position] >= self.threshold:
                        break

        if not idx:
            return "No relevant info found."
        selected = sorted(idx)
        return '\n\n'.join(self._chunker.contextualize(segments[i]) for i in selected)
 
 
 
 
1
+ from tools.base import Tool
2
  from docling.document_converter import DocumentConverter
3
  from docling.chunking import HierarchicalChunker
4
  from sentence_transformers import SentenceTransformer, util
5
  import torch
6
 
7
class ContentRetrieverTool(Tool):
    """Tool that extracts the chunks of a document most relevant to a query.

    The document at ``url`` is converted with docling, split with a
    hierarchical chunker, and both the chunk texts and their context strings
    are embedded with a SentenceTransformer. For each comma-separated query
    term, top-ranked chunks are accumulated until a cumulative
    softmax-probability mass of ``threshold`` is covered; the contextualized
    text of all selected chunks is returned joined by blank lines.
    """

    name = 'retrieve_content'
    description = "Extracts relevant content from a file or URL (PDF, DOCX, XLSX, HTML, etc.) based on a given query."

    inputs = {
        "url": {
            "type": "string",
            "description": "The document URL or local path to load content from.",
        },
        "query": {
            "type": "string",
            "description": "Query term(s) used to filter relevant content from the document.",
        },
    }
    output_type = "string"

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2', threshold: float = 0.2):
        """Set up the converter, chunker, and embedding model.

        Args:
            model_name: SentenceTransformer model identifier.
            threshold: Cumulative softmax-probability mass at which chunk
                selection stops for each query term.
        """
        self.threshold = threshold
        self._converter = DocumentConverter()
        self._chunker = HierarchicalChunker()
        self._embedder = SentenceTransformer(model_name)
        super().__init__()

    def forward(self, url: str, query: str) -> str:
        """Return the contextualized chunks of *url* most relevant to *query*."""
        # Guard against a conversion that yields no document — the previous
        # revision returned an explicit failure message here and this version
        # dropped it; restored to avoid an opaque downstream crash.
        doc = self._converter.convert(url).document
        if doc is None:
            return "Failed to load content."

        chunks = list(self._chunker.chunk(dl_doc=doc))
        if not chunks:
            return "No content found."

        texts = [chunk.text for chunk in chunks]
        contextual_chunks = [self._chunker.contextualize(c) for c in chunks]
        # Context-only strings: the contextualized chunk with its raw text removed.
        context_texts = [ctx.replace(txt, "").strip() for txt, ctx in zip(texts, contextual_chunks)]

        # One embedding per comma-separated query term; an empty/blank query
        # would otherwise hand an empty list to encode() and fail downstream.
        query_terms = [q.strip() for q in query.split(",") if q.strip()]
        if not query_terms:
            return "No relevant chunks found."
        query_embedding = self._embedder.encode(query_terms, convert_to_tensor=True)

        matches = set()
        # Both the raw texts and their contexts get to vote on relevance.
        for corpus in (texts, context_texts):
            embeddings = self._embedder.encode(corpus, convert_to_tensor=True)
            # pytorch_cos_sim yields one similarity row per query term.
            for score in util.pytorch_cos_sim(query_embedding, embeddings):
                probs = torch.nn.functional.softmax(score, dim=0)
                sorted_idxs = torch.argsort(probs, descending=True)

                # Greedily take the best chunks until `threshold` probability
                # mass is covered for this query term.
                cum_prob = 0.0
                for idx in sorted_idxs:
                    cum_prob += probs[idx].item()
                    matches.add(idx.item())
                    if cum_prob >= self.threshold:
                        break

        if not matches:
            return "No relevant chunks found."

        # Preserve original document order when assembling the answer.
        selected_chunks = [contextual_chunks[i] for i in sorted(matches)]
        return "\n\n".join(selected_chunks)