Spaces:
Running
Running
Asish Karthikeya Gogineni committed on
Commit ·
5ceb659
1
Parent(s): 494f2e4
fix: Add reranking to prioritize source code over text files
Browse files
- New _rerank_by_file_type method in GraphEnhancedRetriever
- Boosts main entry points (app.py, main.py, index.js) to top
- Source code files (.py, .js, .ts) ranked higher than .txt files
- Fixes issue where prompt .txt files appeared before actual code
- code_chatbot/graph_rag.py +38 -0
code_chatbot/graph_rag.py
CHANGED
|
@@ -31,12 +31,50 @@ class GraphEnhancedRetriever(BaseRetriever):
|
|
| 31 |
logger.warning(f"No AST graph found at {graph_path}")
|
| 32 |
return None
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
|
| 35 |
# 1. Standard Retrieval
|
| 36 |
logger.info(f"GraphEnhancedRetriever: Querying base retriever with: '{query}'")
|
| 37 |
docs = self.base_retriever.invoke(query)
|
| 38 |
logger.info(f"GraphEnhancedRetriever: Base retriever returned {len(docs)} documents")
|
| 39 |
|
|
|
|
|
|
|
|
|
|
| 40 |
if not self.graph:
|
| 41 |
logger.warning("No AST graph available for enhancement")
|
| 42 |
return docs
|
|
|
|
| 31 |
logger.warning(f"No AST graph found at {graph_path}")
|
| 32 |
return None
|
| 33 |
|
| 34 |
+
def _rerank_by_file_type(self, docs: List[Document]) -> List[Document]:
    """Rerank documents so that source code outranks config and text files.

    Each document is scored from the ``file_path`` entry of its metadata
    (compared case-insensitively):

    * 100 - main entry points (``main.py``, ``app.py``, ``index.js``, ...),
      matched on the exact file name
    * 80  - source code files (``.py``, ``.js``, ``.ts``, ...)
    * 50  - config files (``.json``, ``.yaml``, ``.yml``, ``.toml``)
    * 40  - anything else, including documents without a ``file_path``
    * 30  - text/doc files (``.txt``, ``.md``, ``.rst``)

    Args:
        docs: Documents to reorder.

    Returns:
        A new list sorted by descending priority. The input list is not
        mutated, and because the sort is stable, documents with the same
        priority keep their incoming relative order.
    """
    # Built once per call rather than once per document; tuples can be
    # handed straight to ``str.endswith``.
    main_files = frozenset(
        {"main.py", "app.py", "index.js", "index.ts", "server.py", "api.py"}
    )
    code_extensions = (
        ".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".go", ".rs", ".cpp", ".c",
    )
    config_extensions = (".json", ".yaml", ".yml", ".toml")
    text_extensions = (".txt", ".md", ".rst")

    def base_name(path: str) -> str:
        # Normalise Windows separators so both path styles yield the file name.
        return path.replace("\\", "/").rsplit("/", 1)[-1]

    def get_priority(doc: Document) -> int:
        # ``or ""`` covers both a missing key and an explicit ``None`` value.
        file_name = base_name(str(doc.metadata.get("file_path") or "")).lower()

        # Highest priority: main entry points. Compare the whole file name:
        # a suffix test would also promote e.g. "webapp.py" or "domain.py".
        if file_name in main_files:
            return 100

        # High priority: source code files
        if file_name.endswith(code_extensions):
            return 80

        # Medium priority: config files (still useful)
        if file_name.endswith(config_extensions):
            return 50

        # Lower priority: text/doc files (often too generic)
        if file_name.endswith(text_extensions):
            return 30

        # Default
        return 40

    # sorted() is stable even with reverse=True, so ties keep retrieval order.
    ranked = sorted(docs, key=get_priority, reverse=True)
    top_files = [
        base_name(str(d.metadata.get("file_path") or "?")) for d in ranked[:3]
    ]
    logger.info(f"Reranked docs: top files are {top_files}")
    return ranked
|
| 68 |
+
|
| 69 |
def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
|
| 70 |
# 1. Standard Retrieval
|
| 71 |
logger.info(f"GraphEnhancedRetriever: Querying base retriever with: '{query}'")
|
| 72 |
docs = self.base_retriever.invoke(query)
|
| 73 |
logger.info(f"GraphEnhancedRetriever: Base retriever returned {len(docs)} documents")
|
| 74 |
|
| 75 |
+
# 2. Rerank: Prioritize source code over config/text files
|
| 76 |
+
docs = self._rerank_by_file_type(docs)
|
| 77 |
+
|
| 78 |
if not self.graph:
|
| 79 |
logger.warning("No AST graph available for enhancement")
|
| 80 |
return docs
|