Asish Karthikeya Gogineni committed on
Commit
5ceb659
·
1 Parent(s): 494f2e4

fix: Add reranking to prioritize source code over text files

Browse files

- New _rerank_by_file_type method in GraphEnhancedRetriever
- Boosts main entry points (app.py, main.py, index.js) to top
- Source code files (.py, .js, .ts) ranked higher than .txt files
- Fixes issue where prompt .txt files appeared before actual code

Files changed (1) hide show
  1. code_chatbot/graph_rag.py +38 -0
code_chatbot/graph_rag.py CHANGED
@@ -31,12 +31,50 @@ class GraphEnhancedRetriever(BaseRetriever):
31
  logger.warning(f"No AST graph found at {graph_path}")
32
  return None
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
35
  # 1. Standard Retrieval
36
  logger.info(f"GraphEnhancedRetriever: Querying base retriever with: '{query}'")
37
  docs = self.base_retriever.invoke(query)
38
  logger.info(f"GraphEnhancedRetriever: Base retriever returned {len(docs)} documents")
39
 
 
 
 
40
  if not self.graph:
41
  logger.warning("No AST graph available for enhancement")
42
  return docs
 
31
  logger.warning(f"No AST graph found at {graph_path}")
32
  return None
33
 
34
def _rerank_by_file_type(self, docs: List[Document]) -> List[Document]:
    """Rerank documents so source code outranks config and text files.

    Each document is scored from the ``file_path`` entry of its metadata:

    * 100 -- file name is a well-known entry point (``main.py``, ``app.py``, ...)
    * 80  -- file has a source-code extension
    * 50  -- file has a config extension
    * 40  -- anything else, including documents without a ``file_path``
    * 30  -- plain text / documentation files

    Args:
        docs: Documents returned by the base retriever.

    Returns:
        A new list ordered by descending score. Documents with equal scores
        keep their original relative order; the input list is not modified.
    """
    # Entry points are matched on the exact file name. A suffix match on the
    # whole path would also promote e.g. "domain.py" or "webapp.py".
    main_files = frozenset(
        {"main.py", "app.py", "index.js", "index.ts", "server.py", "api.py"}
    )
    # str.endswith accepts a tuple, so each category needs only one call.
    code_extensions = (
        ".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".go", ".rs", ".cpp", ".c",
    )
    config_extensions = (".json", ".yaml", ".yml", ".toml")
    text_extensions = (".txt", ".md", ".rst")

    def get_file_name(doc: Document) -> str:
        # A missing or None file_path is treated as an empty name.
        raw_path = str(doc.metadata.get("file_path") or "")
        # Normalise Windows separators before taking the last path segment.
        return raw_path.replace("\\", "/").rsplit("/", 1)[-1]

    def get_priority(doc: Document) -> int:
        name = get_file_name(doc).lower()
        if name in main_files:
            return 100
        if name.endswith(code_extensions):
            return 80
        if name.endswith(config_extensions):
            return 50
        if name.endswith(text_extensions):
            return 30
        return 40

    # sorted() is stable, also with reverse=True, so ties keep retriever order.
    ranked = sorted(docs, key=get_priority, reverse=True)
    logger.info(
        "Reranked docs: top files are %s",
        [get_file_name(d) or "?" for d in ranked[:3]],
    )
    return ranked
68
+
69
  def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
70
  # 1. Standard Retrieval
71
  logger.info(f"GraphEnhancedRetriever: Querying base retriever with: '{query}'")
72
  docs = self.base_retriever.invoke(query)
73
  logger.info(f"GraphEnhancedRetriever: Base retriever returned {len(docs)} documents")
74
 
75
+ # 2. Rerank: Prioritize source code over config/text files
76
+ docs = self._rerank_by_file_type(docs)
77
+
78
  if not self.graph:
79
  logger.warning("No AST graph available for enhancement")
80
  return docs