genaitiwari committed on
Commit
f46b6ca
·
1 Parent(s): a886d4b

code logic completed for uploaded files with qdrant

Browse files
Files changed (3) hide show
  1. Loaders/generic.py +56 -2
  2. app.py +34 -16
  3. config.ini +12 -1
Loaders/generic.py CHANGED
@@ -1,4 +1,6 @@
1
  #import
 
 
2
  from langchain_community.document_loaders.generic import GenericLoader
3
  from langchain_community.document_loaders.parsers import LanguageParser
4
  from langchain_text_splitters import Language
@@ -10,7 +12,8 @@ class GenericLoaders:
10
  self.main_config = main_config
11
  self.repo_path = repo_path
12
  self.rag_path_ext= rag_path_ext
13
-
 
14
  def loaders(self):
15
  # Load
16
  try :
@@ -29,10 +32,61 @@ class GenericLoaders:
29
  )
30
  documents = loader.load()
31
  print(len(documents))
32
-
33
  except Exception as e:
34
  print(e)
35
 
36
  return documents
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
 
1
  #import
2
+ import os
3
+ import tempfile
4
  from langchain_community.document_loaders.generic import GenericLoader
5
  from langchain_community.document_loaders.parsers import LanguageParser
6
  from langchain_text_splitters import Language
 
12
  self.main_config = main_config
13
  self.repo_path = repo_path
14
  self.rag_path_ext= rag_path_ext
15
+
16
+ # Used when the files are already present in a directory
17
  def loaders(self):
18
  # Load
19
  try :
 
32
  )
33
  documents = loader.load()
34
  print(len(documents))
 
35
  except Exception as e:
36
  print(e)
37
 
38
  return documents
39
 
40
+
41
+ # Used when the files are uploaded through Streamlit
42
def code_loader_and_splitters(self):
    """Load the uploaded source files and split them into language-aware chunks.

    Every item of ``self.repo_path`` must expose ``name`` and ``read()``
    (presumably Streamlit uploads -- confirm against the caller).  Each file
    is written to a temporary path that keeps its extension, loaded through
    ``GenericLoader`` / ``LanguageParser`` and split with a
    ``RecursiveCharacterTextSplitter`` built for the language that the
    ``lang_exts`` mapping of the ``[Langlist]`` config section associates
    with that extension.

    Returns:
        list: the chunks of all files, in upload order.  Empty when
        ``self.repo_path`` is empty or ``None``.

    Raises:
        ValueError: on any failure -- unreadable config, an extension with
            no configured language, or a loader/splitter error.  The
            original exception is chained as ``__cause__``.
    """
    import ast  # local import: only this method needs it

    docs = []
    try:
        code_files = self.repo_path
        langlist = dict(self.main_config['Langlist'])
        # literal_eval accepts only Python literals, so a tampered config
        # value cannot execute code the way eval() would.
        langlist = ast.literal_eval(langlist["lang_exts"])
        # Reverse lookup (extension -> language), built once for all files.
        ext_to_lang = {ext.lower(): lang for lang, ext in langlist.items()}

        # Validate every upload before touching the disk, so a bad file
        # fails fast and leaves no temporary files behind.
        pending = []
        for code_file in code_files or []:
            lang_ext = os.path.splitext(code_file.name)[1].lower()
            selected_lang = ext_to_lang.get(lang_ext, "")
            if selected_lang == "":
                raise ValueError("Issue in evaluating techctack from extension")
            print(f"selected lang: {selected_lang}")  # For debugging
            pending.append((code_file, lang_ext, selected_lang))

        if not pending:
            return docs

        # Imported here because the visible module header only pulls
        # `Language` from this package.
        from langchain_text_splitters import RecursiveCharacterTextSplitter

        for code_file, lang_ext, selected_lang in pending:
            # The suffix gives the temp file its final name up front, so no
            # rename of a still-open file is needed (that fails on Windows).
            tmp_file = tempfile.NamedTemporaryFile(suffix=lang_ext, delete=False)
            tmp_file_path = tmp_file.name.replace("\\", "/")
            try:
                with tmp_file:
                    tmp_file.write(code_file.read())

                loader = GenericLoader.from_filesystem(
                    tmp_file_path,
                    glob="**/*",
                    suffixes=[lang_ext],
                    exclude=["**/non-utf8-encoding.py"],
                    parser=LanguageParser(parser_threshold=500),
                )
                # Split only this file's documents: splitting a running
                # total would re-emit the chunks of every earlier file.
                file_documents = loader.load()
                splitter = RecursiveCharacterTextSplitter.from_language(
                    language=selected_lang, chunk_size=2000, chunk_overlap=200
                )
                docs.extend(splitter.split_documents(file_documents))
            finally:
                # Clean up the temporary file even when loading/splitting fails.
                os.remove(tmp_file_path)

    except Exception as e:
        raise ValueError(f"Problem Occured in loader and splitter : {e}") from e

    return docs
90
+
91
+
92
 
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from langchain_text_splitters import TextSplitter
2
  import streamlit as st
3
  import pandas as pd
@@ -32,13 +33,17 @@ main_config.read(config_file_path)
32
  #Process Code Repo
33
  def process_code_repo(main_config,db_option,repo_path,groq_api_key,selected_model, rag_path_ext,query):
34
  if db_option == "Qdrant":
35
- # LOAD
36
- obj_loaders = GenericLoaders(main_config,repo_path, rag_path_ext)
37
- documents = obj_loaders.loaders()
38
 
39
- # TRANSFORM
40
- obj_code_splitters = CodeSplitters(main_config,repo_path,rag_path_ext)
41
- docs = obj_code_splitters.code_splitters(documents)
 
 
 
 
42
 
43
  #EMBEDDINGS and stored into VDB
44
  obj_qdr = QdrantVDB(main_config,repo_path,groq_api_key,selected_model, rag_path_ext,query)
@@ -133,17 +138,29 @@ db_option = st.sidebar.selectbox("Select Vector DB:", ["Qdrant", "Chroma"])
133
  st.sidebar.header("RAG Document Format")
134
  file_format = st.sidebar.selectbox("Select Input Document Format:", ["Code Repo", "PDF", "CSV"])
135
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
  if file_format == "Code Repo":
138
- st.sidebar.header("Select Language")
139
- lang_options = ["java", "python", "csharp"] # Replace with your actual models
140
- selected_lang = st.sidebar.selectbox("Select Model:", lang_options)
141
- if selected_lang == "java":
142
- rag_path_ext = ".java"
143
- elif selected_lang == "python":
144
- rag_path_ext = ".py"
145
- elif selected_lang == "csharp":
146
- rag_path_ext = ".cs"
 
147
 
148
  elif file_format == "PDF":
149
  rag_path_ext = ".pdf"
@@ -152,7 +169,8 @@ elif file_format == "CSV":
152
  rag_path_ext = ".csv"
153
 
154
 
155
- uploaded_files = st.file_uploader(f"Upload a {rag_path_ext} file", type=rag_path_ext,accept_multiple_files=True,)
 
156
 
157
 
158
  st.sidebar.header("Groq API Key")
 
1
+ from typing import Sequence
2
  from langchain_text_splitters import TextSplitter
3
  import streamlit as st
4
  import pandas as pd
 
33
  #Process Code Repo
34
  def process_code_repo(main_config,db_option,repo_path,groq_api_key,selected_model, rag_path_ext,query):
35
  if db_option == "Qdrant":
36
+ # # LOAD
37
+ # obj_loaders = GenericLoaders(main_config,repo_path, rag_path_ext)
38
+ # documents = obj_loaders.loaders()
39
 
40
+ # # TRANSFORM
41
+ # obj_code_splitters = CodeSplitters(main_config,repo_path,rag_path_ext)
42
+ # docs = obj_code_splitters.code_splitters(documents)
43
+
44
+ #LOAD and SPLIT
45
+ obj_loaders_splitter = GenericLoaders(main_config,repo_path, rag_path_ext)
46
+ docs = obj_loaders_splitter.code_loader_and_splitters()
47
 
48
  #EMBEDDINGS and stored into VDB
49
  obj_qdr = QdrantVDB(main_config,repo_path,groq_api_key,selected_model, rag_path_ext,query)
 
138
  st.sidebar.header("RAG Document Format")
139
  file_format = st.sidebar.selectbox("Select Input Document Format:", ["Code Repo", "PDF", "CSV"])
140
 
141
+ # Check if the section exists
142
+ if not main_config.has_section('Supported_Extensions'):
143
+ raise ValueError("Section 'Supported_Extensions' not found in config.ini")
144
+
145
+ # Extract extensions from the config dictionary
146
+ extensions = main_config['Supported_Extensions']['extensions']
147
+ supported_extensions = eval(extensions)
148
+ accepted_types = [f"{ext}" for ext in supported_extensions]
149
+
150
+ # Print the list of extensions
151
+ print(accepted_types)
152
 
153
  if file_format == "Code Repo":
154
+ # st.sidebar.header("Select Language")
155
+ # lang_options = ["java", "python", "csharp"] # Replace with your actual models
156
+ # selected_lang = st.sidebar.selectbox("Select Model:", lang_options)
157
+ # if selected_lang == "java":
158
+ # rag_path_ext = ".java"
159
+ # elif selected_lang == "python":
160
+ # rag_path_ext = ".py"
161
+ # elif selected_lang == "csharp":
162
+ # rag_path_ext = ".cs"
163
+ rag_path_ext = accepted_types
164
 
165
  elif file_format == "PDF":
166
  rag_path_ext = ".pdf"
 
169
  rag_path_ext = ".csv"
170
 
171
 
172
+
173
+ uploaded_files = st.file_uploader(f"Upload files", type=rag_path_ext,accept_multiple_files=True,)
174
 
175
 
176
  st.sidebar.header("Groq API Key")
config.ini CHANGED
@@ -1,2 +1,13 @@
1
  [Langlist]
2
- lang_exts = {"java":".java","csharp":".cs","python":".py"}
 
 
 
 
 
 
 
 
 
 
 
 
1
  [Langlist]
2
+ lang_exts = {"c": ".c", "cpp": ".cpp", "cobol": ".cbl", "elixir": ".ex",
3
+ "go": ".go", "haskell": ".hs", "html": ".html", "java": ".java",
4
+ "js": ".js", "kotlin": ".kt", "lua": ".lua", "latex": ".tex",
5
+ "markdown": ".md", "perl": ".pl", "php": ".php", "proto": ".proto",
6
+ "python": ".py", "rst": ".rst", "ruby": ".rb", "rust": ".rs",
7
+ "scala": ".scala", "sol": ".sol", "swift": ".swift", "csharp": ".cs"}
8
+
9
+
10
+ [Supported_Extensions]
11
+ extensions = [".cpp", ".go", ".java", ".kt", ".js", ".ts", ".php", ".proto", ".py", ".rst",
12
+ ".rb", ".rs", ".scala", ".swift", ".md", ".tex", ".html", ".sol", ".cs",
13
+ ".cob", ".c", ".lua", ".pl", ".hs", ".ex"]