SVashishta1 committed on
Commit
6950cd1
·
1 Parent(s): a610301

Initial commit

Browse files
Files changed (4) hide show
  1. app.py +5 -5
  2. backend/db.py +3 -3
  3. backend/document_parser.py +10 -10
  4. backend/vector_db.py +5 -5
app.py CHANGED
@@ -456,7 +456,7 @@ def clear_context():
456
  except Exception as e:
457
  return [{"role": "assistant", "content": f"Error clearing context: {str(e)}"}]
458
 
459
- # I am making a function for voice input but we are not using it
460
  """
461
  def process_voice_input(audio_path):
462
  # I am checking if there is audio
@@ -484,7 +484,7 @@ def process_voice_input(audio_path):
484
  return f"Error processing audio: {str(e)}"
485
  """
486
 
487
- # I am making a function for text to speech but we are not using it
488
  """
489
  def text_to_speech_output(text):
490
  # I am checking if there is text
@@ -867,7 +867,7 @@ with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
867
  clear_btn.click(lambda: None, None, [chatbot], queue=False)
868
  clear_context_btn.click(clear_context, inputs=[], outputs=[chatbot])
869
 
870
- # I am commenting out voice button click because we are not using it
871
  """
872
  voice_btn.click(
873
  lambda: gr.update(visible=True),
@@ -876,7 +876,7 @@ with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
876
  )
877
  """
878
 
879
- # I am commenting out voice input change because we are not using it
880
  """
881
  voice_input.change(
882
  process_voice_input,
@@ -885,7 +885,7 @@ with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
885
  )
886
  """
887
 
888
- # I am commenting out TTS button because we are not using it
889
  """
890
  tts_btn = gr.Button("🔊 Speak Response")
891
  tts_btn.click(
 
456
  except Exception as e:
457
  return [{"role": "assistant", "content": f"Error clearing context: {str(e)}"}]
458
 
459
+ # I am making a function for voice input but we are not using it in this version(still in development phase)
460
  """
461
  def process_voice_input(audio_path):
462
  # I am checking if there is audio
 
484
  return f"Error processing audio: {str(e)}"
485
  """
486
 
487
+ # a function for text to speech
488
  """
489
  def text_to_speech_output(text):
490
  # I am checking if there is text
 
867
  clear_btn.click(lambda: None, None, [chatbot], queue=False)
868
  clear_context_btn.click(clear_context, inputs=[], outputs=[chatbot])
869
 
870
+ # I am commenting out voice button click because it is still in development phase
871
  """
872
  voice_btn.click(
873
  lambda: gr.update(visible=True),
 
876
  )
877
  """
878
 
879
+ # I am commenting out voice input change because it is still in development phase
880
  """
881
  voice_input.change(
882
  process_voice_input,
 
885
  )
886
  """
887
 
888
+ # I am commenting out TTS button because it is still in development phase
889
  """
890
  tts_btn = gr.Button("🔊 Speak Response")
891
  tts_btn.click(
backend/db.py CHANGED
@@ -28,7 +28,7 @@ class SimpleDB:
28
  """Add a document to the database"""
29
  db = self._read_db()
30
 
31
- # Generate a simple ID
32
  doc_id = len(db["documents"]) + 1
33
 
34
  # Add document
@@ -60,10 +60,10 @@ class SimpleDB:
60
  """Log a user query and its response"""
61
  db = self._read_db()
62
 
63
- # Generate a simple ID
64
  query_id = len(db["queries"]) + 1
65
 
66
- # Add query
67
  db["queries"].append({
68
  "id": query_id,
69
  "query_text": query_text,
 
28
  """Add a document to the database"""
29
  db = self._read_db()
30
 
31
+ # Generating a simple ID
32
  doc_id = len(db["documents"]) + 1
33
 
34
  # Add document
 
60
  """Log a user query and its response"""
61
  db = self._read_db()
62
 
63
+ # Generating a simple ID
64
  query_id = len(db["queries"]) + 1
65
 
66
+ # Adding query
67
  db["queries"].append({
68
  "id": query_id,
69
  "query_text": query_text,
backend/document_parser.py CHANGED
@@ -22,17 +22,17 @@ class SimpleDocumentParser:
22
  elif file_ext in ['.csv', '.xlsx', '.xls']:
23
  return self.parse_tabular(file_path)
24
  else:
25
- # Default to text parsing
26
  return self.parse_text(file_path)
27
 
28
  def parse_pdf(self, file_path: str) -> List[str]:
29
  """Parse PDF using PyMuPDF"""
30
  chunks = []
31
  try:
32
- # Open the PDF
33
  doc = fitz.open(file_path)
34
 
35
- # Extract text from each page
36
  for page_num in range(len(doc)):
37
  page = doc.load_page(page_num)
38
  text = page.get_text()
@@ -57,7 +57,7 @@ class SimpleDocumentParser:
57
  with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
58
  text = f.read()
59
 
60
- # Split by paragraphs
61
  paragraphs = text.split('\n\n')
62
  for para in paragraphs:
63
  if len(para.strip()) > 0:
@@ -74,7 +74,7 @@ class SimpleDocumentParser:
74
  try:
75
  doc = docx.Document(file_path)
76
 
77
- # Extract text from paragraphs
78
  for para in doc.paragraphs:
79
  if len(para.text.strip()) > 0:
80
  chunks.append(para.text.strip())
@@ -85,7 +85,7 @@ class SimpleDocumentParser:
85
  return chunks
86
 
87
  def parse_tabular(self, file_path: str) -> List[str]:
88
- """Parse CSV or Excel files using pandas"""
89
  chunks = []
90
  try:
91
  file_ext = os.path.splitext(file_path)[1].lower()
@@ -95,22 +95,22 @@ class SimpleDocumentParser:
95
  else: # Excel files
96
  df = pd.read_excel(file_path)
97
 
98
- # Add table summary
99
  summary = f"Table with {len(df)} rows and {len(df.columns)} columns. "
100
  summary += f"Columns: {', '.join(df.columns.tolist())}"
101
  chunks.append(summary)
102
 
103
- # Add column descriptions with data types
104
  col_types = df.dtypes.to_dict()
105
  col_desc = "Column details:\n"
106
  for col, dtype in col_types.items():
107
- # Add sample values for each column (first 3 unique values)
108
  sample_values = df[col].dropna().unique()[:3]
109
  sample_str = ", ".join([str(v) for v in sample_values])
110
  col_desc += f"- {col} (Type: {dtype}): Sample values: {sample_str}\n"
111
  chunks.append(col_desc)
112
 
113
- # Convert each row to a text chunk (limit to first 50 rows for indexing)
114
  for index, row in df.head(50).iterrows():
115
  row_text = " | ".join([f"{col}: {val}" for col, val in row.items()])
116
  chunks.append(row_text)
 
22
  elif file_ext in ['.csv', '.xlsx', '.xls']:
23
  return self.parse_tabular(file_path)
24
  else:
25
+
26
  return self.parse_text(file_path)
27
 
28
  def parse_pdf(self, file_path: str) -> List[str]:
29
  """Parse PDF using PyMuPDF"""
30
  chunks = []
31
  try:
32
+ # Opening the PDF
33
  doc = fitz.open(file_path)
34
 
35
+ # Extracting text from each page
36
  for page_num in range(len(doc)):
37
  page = doc.load_page(page_num)
38
  text = page.get_text()
 
57
  with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
58
  text = f.read()
59
 
60
+ # Splitting by paragraphs
61
  paragraphs = text.split('\n\n')
62
  for para in paragraphs:
63
  if len(para.strip()) > 0:
 
74
  try:
75
  doc = docx.Document(file_path)
76
 
77
+ # Extracting text from paragraphs
78
  for para in doc.paragraphs:
79
  if len(para.text.strip()) > 0:
80
  chunks.append(para.text.strip())
 
85
  return chunks
86
 
87
  def parse_tabular(self, file_path: str) -> List[str]:
88
+ """Parsing CSV or Excel files using pandas"""
89
  chunks = []
90
  try:
91
  file_ext = os.path.splitext(file_path)[1].lower()
 
95
  else: # Excel files
96
  df = pd.read_excel(file_path)
97
 
98
+ # Adding table summary
99
  summary = f"Table with {len(df)} rows and {len(df.columns)} columns. "
100
  summary += f"Columns: {', '.join(df.columns.tolist())}"
101
  chunks.append(summary)
102
 
103
+ # Adding column descriptions with data types
104
  col_types = df.dtypes.to_dict()
105
  col_desc = "Column details:\n"
106
  for col, dtype in col_types.items():
107
+ # Adding sample values for each column (first 3 unique values)
108
  sample_values = df[col].dropna().unique()[:3]
109
  sample_str = ", ".join([str(v) for v in sample_values])
110
  col_desc += f"- {col} (Type: {dtype}): Sample values: {sample_str}\n"
111
  chunks.append(col_desc)
112
 
113
+ # Converting each row to a text chunk (limit to first 50 rows for indexing)
114
  for index, row in df.head(50).iterrows():
115
  row_text = " | ".join([f"{col}: {val}" for col, val in row.items()])
116
  chunks.append(row_text)
backend/vector_db.py CHANGED
@@ -42,8 +42,8 @@ class ChromaVectorDB:
42
  return results
43
 
44
  def delete_document(self, file_path: str):
45
- """Delete all chunks from a specific document"""
46
- # Get all IDs related to this document
47
  results = self.collection.get(
48
  where={"source": file_path}
49
  )
@@ -54,11 +54,11 @@ class ChromaVectorDB:
54
  def reset_collection(self):
55
  """Reset the collection by clearing all documents"""
56
  try:
57
- # Get all document IDs
58
  try:
59
  all_ids = self.collection.get()["ids"]
60
  if all_ids:
61
- # Delete all documents
62
  self.collection.delete(ids=all_ids)
63
  print(f"Deleted {len(all_ids)} documents from collection")
64
  else:
@@ -67,7 +67,7 @@ class ChromaVectorDB:
67
  except Exception as e:
68
  print(f"Error getting or deleting documents: {str(e)}")
69
 
70
- # Try recreating the collection as a fallback
71
  try:
72
  self.client.delete_collection("documents")
73
  self.collection = self.client.get_or_create_collection("documents")
 
42
  return results
43
 
44
  def delete_document(self, file_path: str):
45
+ """Deleting all chunks from a specific document"""
46
+ # Getting all IDs related to this document
47
  results = self.collection.get(
48
  where={"source": file_path}
49
  )
 
54
  def reset_collection(self):
55
  """Reset the collection by clearing all documents"""
56
  try:
57
+ # Getting all document IDs
58
  try:
59
  all_ids = self.collection.get()["ids"]
60
  if all_ids:
61
+ # Deleting all documents
62
  self.collection.delete(ids=all_ids)
63
  print(f"Deleted {len(all_ids)} documents from collection")
64
  else:
 
67
  except Exception as e:
68
  print(f"Error getting or deleting documents: {str(e)}")
69
 
70
+ # Trying to recreate the collection as a fallback
71
  try:
72
  self.client.delete_collection("documents")
73
  self.collection = self.client.get_or_create_collection("documents")