HimanshuGoyal2004 committed on
Commit
1598ff7
·
1 Parent(s): 943209d
Files changed (1) hide show
  1. app.py +82 -18
app.py CHANGED
@@ -6,7 +6,53 @@ import requests
6
  import gradio as gr
7
  from dotenv import load_dotenv
8
  from datasets import load_dataset
9
- from langchain_core.documents import Document
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  from langchain_community.retrievers import BM25Retriever
11
 
12
  # Load environment variables
@@ -81,25 +127,43 @@ class GitHubMCPServer:
81
  return
82
 
83
  # Split documents into smaller chunks for better retrieval
84
- text_splitter = RecursiveCharacterTextSplitter(
85
- chunk_size=500, # Characters per chunk
86
- chunk_overlap=50, # Overlap between chunks to maintain context
87
- add_start_index=True,
88
- strip_whitespace=True,
89
- separators=["\n\n", "\n", ".", " ", ""], # Priority order for splitting
90
- )
91
-
92
- docs_processed = text_splitter.split_documents(source_docs)
93
-
94
- print(f"📚 Knowledge base prepared with {len(docs_processed)} document chunks")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
  # Initialize BM25 retriever
97
- self.cve_retriever = BM25Retriever.from_documents(
98
- docs_processed,
99
- k=3
100
- )
101
-
102
- print(f"✅ CVE Retriever initialized with {len(docs_processed)} document chunks")
 
 
 
 
103
 
104
  except Exception as e:
105
  print(f"❌ Error initializing CVE retriever: {str(e)}")
 
6
  import gradio as gr
7
  from dotenv import load_dotenv
8
  from datasets import load_dataset
9
+ try:
10
+ from langchain_core.documents import Document
11
+ except ImportError:
12
+ try:
13
+ from langchain.docstore.document import Document
14
+ except ImportError:
15
+ try:
16
+ from langchain.schema import Document
17
+ except ImportError:
18
+ # Fallback: Create a simple Document class
19
+ class Document:
20
+ def __init__(self, page_content: str, metadata: dict = None):
21
+ self.page_content = page_content
22
+ self.metadata = metadata or {}
23
+
24
+ # Import RecursiveCharacterTextSplitter with fallback
25
+ RecursiveCharacterTextSplitter = None
26
+
27
+ try:
28
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
29
+ print("✅ Using langchain.text_splitter.RecursiveCharacterTextSplitter")
30
+ except ImportError:
31
+ try:
32
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
33
+ print("✅ Using langchain_text_splitters.RecursiveCharacterTextSplitter")
34
+ except ImportError:
35
+ print("⚠️ Using fallback RecursiveCharacterTextSplitter")
36
+ # Fallback: Simple text splitter
37
+ class RecursiveCharacterTextSplitter:
38
+ def __init__(self, chunk_size=500, chunk_overlap=50, **kwargs):
39
+ self.chunk_size = chunk_size
40
+ self.chunk_overlap = chunk_overlap
41
+ print(f"πŸ“ Initialized fallback text splitter with chunk_size={chunk_size}")
42
+
43
+ def split_documents(self, documents):
44
+ """Simple document splitting fallback"""
45
+ print(f"🔄 Splitting {len(documents)} documents using fallback method...")
46
+ result = []
47
+ for doc in documents:
48
+ text = doc.page_content
49
+ # Simple chunking
50
+ for i in range(0, len(text), self.chunk_size - self.chunk_overlap):
51
+ chunk = text[i:i + self.chunk_size]
52
+ if chunk.strip():
53
+ result.append(Document(page_content=chunk, metadata=doc.metadata))
54
+ print(f"✅ Split into {len(result)} chunks")
55
+ return result
56
  from langchain_community.retrievers import BM25Retriever
57
 
58
  # Load environment variables
 
127
  return
128
 
129
  # Split documents into smaller chunks for better retrieval
130
+ print("🔄 Initializing text splitter...")
131
+ try:
132
+ text_splitter = RecursiveCharacterTextSplitter(
133
+ chunk_size=500, # Characters per chunk
134
+ chunk_overlap=50, # Overlap between chunks to maintain context
135
+ add_start_index=True,
136
+ strip_whitespace=True,
137
+ separators=["\n\n", "\n", ".", " ", ""], # Priority order for splitting
138
+ )
139
+ print("✅ Text splitter initialized successfully")
140
+ except Exception as splitter_error:
141
+ print(f"❌ Text splitter initialization failed: {splitter_error}")
142
+ # Use simple fallback
143
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
144
+ print("✅ Using simple fallback text splitter")
145
+
146
+ print("🔄 Processing documents with text splitter...")
147
+ try:
148
+ docs_processed = text_splitter.split_documents(source_docs)
149
+ print(f"📚 Knowledge base prepared with {len(docs_processed)} document chunks")
150
+ except Exception as processing_error:
151
+ print(f"❌ Document processing failed: {processing_error}")
152
+ # Use original documents without splitting as fallback
153
+ docs_processed = source_docs
154
+ print(f"✅ Using original documents without splitting: {len(docs_processed)} documents")
155
 
156
  # Initialize BM25 retriever
157
+ print("🔄 Initializing BM25 retriever...")
158
+ try:
159
+ self.cve_retriever = BM25Retriever.from_documents(
160
+ docs_processed,
161
+ k=3
162
+ )
163
+ print(f"✅ CVE Retriever initialized with {len(docs_processed)} document chunks")
164
+ except Exception as retriever_error:
165
+ print(f"❌ BM25 retriever initialization failed: {retriever_error}")
166
+ self.cve_retriever = None
167
 
168
  except Exception as e:
169
  print(f"❌ Error initializing CVE retriever: {str(e)}")