SVashishta1
committed on
Commit
·
6950cd1
1
Parent(s):
a610301
Initial commit
Browse files
- app.py +5 -5
- backend/db.py +3 -3
- backend/document_parser.py +10 -10
- backend/vector_db.py +5 -5
app.py
CHANGED
|
@@ -456,7 +456,7 @@ def clear_context():
|
|
| 456 |
except Exception as e:
|
| 457 |
return [{"role": "assistant", "content": f"Error clearing context: {str(e)}"}]
|
| 458 |
|
| 459 |
-
# I am making a function for voice input but we are not using it
|
| 460 |
"""
|
| 461 |
def process_voice_input(audio_path):
|
| 462 |
# I am checking if there is audio
|
|
@@ -484,7 +484,7 @@ def process_voice_input(audio_path):
|
|
| 484 |
return f"Error processing audio: {str(e)}"
|
| 485 |
"""
|
| 486 |
|
| 487 |
-
#
|
| 488 |
"""
|
| 489 |
def text_to_speech_output(text):
|
| 490 |
# I am checking if there is text
|
|
@@ -867,7 +867,7 @@ with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
|
|
| 867 |
clear_btn.click(lambda: None, None, [chatbot], queue=False)
|
| 868 |
clear_context_btn.click(clear_context, inputs=[], outputs=[chatbot])
|
| 869 |
|
| 870 |
-
# I am commenting out voice button click because
|
| 871 |
"""
|
| 872 |
voice_btn.click(
|
| 873 |
lambda: gr.update(visible=True),
|
|
@@ -876,7 +876,7 @@ with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
|
|
| 876 |
)
|
| 877 |
"""
|
| 878 |
|
| 879 |
-
# I am commenting out voice input change because
|
| 880 |
"""
|
| 881 |
voice_input.change(
|
| 882 |
process_voice_input,
|
|
@@ -885,7 +885,7 @@ with gr.Blocks(title="AI Document Analysis & Voice Assistant") as demo:
|
|
| 885 |
)
|
| 886 |
"""
|
| 887 |
|
| 888 |
-
# I am commenting out TTS button because
|
| 889 |
"""
|
| 890 |
tts_btn = gr.Button("🔊 Speak Response")
|
| 891 |
tts_btn.click(
|
|
|
|
| 456 |
except Exception as e:
|
| 457 |
return [{"role": "assistant", "content": f"Error clearing context: {str(e)}"}]
|
| 458 |
|
| 459 |
+
# I am making a function for voice input but we are not using it in this version (still in development phase)
|
| 460 |
"""
|
| 461 |
def process_voice_input(audio_path):
|
| 462 |
# I am checking if there is audio
|
|
|
|
| 484 |
return f"Error processing audio: {str(e)}"
|
| 485 |
"""
|
| 486 |
|
| 487 |
+
# a function for text to speech
|
| 488 |
"""
|
| 489 |
def text_to_speech_output(text):
|
| 490 |
# I am checking if there is text
|
|
|
|
| 867 |
clear_btn.click(lambda: None, None, [chatbot], queue=False)
|
| 868 |
clear_context_btn.click(clear_context, inputs=[], outputs=[chatbot])
|
| 869 |
|
| 870 |
+
# I am commenting out voice button click because it is still in development phase
|
| 871 |
"""
|
| 872 |
voice_btn.click(
|
| 873 |
lambda: gr.update(visible=True),
|
|
|
|
| 876 |
)
|
| 877 |
"""
|
| 878 |
|
| 879 |
+
# I am commenting out voice input change because it is still in development phase
|
| 880 |
"""
|
| 881 |
voice_input.change(
|
| 882 |
process_voice_input,
|
|
|
|
| 885 |
)
|
| 886 |
"""
|
| 887 |
|
| 888 |
+
# I am commenting out TTS button because it is still in development phase
|
| 889 |
"""
|
| 890 |
tts_btn = gr.Button("🔊 Speak Response")
|
| 891 |
tts_btn.click(
|
backend/db.py
CHANGED
|
@@ -28,7 +28,7 @@ class SimpleDB:
|
|
| 28 |
"""Add a document to the database"""
|
| 29 |
db = self._read_db()
|
| 30 |
|
| 31 |
-
#
|
| 32 |
doc_id = len(db["documents"]) + 1
|
| 33 |
|
| 34 |
# Add document
|
|
@@ -60,10 +60,10 @@ class SimpleDB:
|
|
| 60 |
"""Log a user query and its response"""
|
| 61 |
db = self._read_db()
|
| 62 |
|
| 63 |
-
#
|
| 64 |
query_id = len(db["queries"]) + 1
|
| 65 |
|
| 66 |
-
#
|
| 67 |
db["queries"].append({
|
| 68 |
"id": query_id,
|
| 69 |
"query_text": query_text,
|
|
|
|
| 28 |
"""Add a document to the database"""
|
| 29 |
db = self._read_db()
|
| 30 |
|
| 31 |
+
# Generating a simple ID
|
| 32 |
doc_id = len(db["documents"]) + 1
|
| 33 |
|
| 34 |
# Add document
|
|
|
|
| 60 |
"""Log a user query and its response"""
|
| 61 |
db = self._read_db()
|
| 62 |
|
| 63 |
+
# Generating a simple ID
|
| 64 |
query_id = len(db["queries"]) + 1
|
| 65 |
|
| 66 |
+
# Adding query
|
| 67 |
db["queries"].append({
|
| 68 |
"id": query_id,
|
| 69 |
"query_text": query_text,
|
backend/document_parser.py
CHANGED
|
@@ -22,17 +22,17 @@ class SimpleDocumentParser:
|
|
| 22 |
elif file_ext in ['.csv', '.xlsx', '.xls']:
|
| 23 |
return self.parse_tabular(file_path)
|
| 24 |
else:
|
| 25 |
-
|
| 26 |
return self.parse_text(file_path)
|
| 27 |
|
| 28 |
def parse_pdf(self, file_path: str) -> List[str]:
|
| 29 |
"""Parse PDF using PyMuPDF"""
|
| 30 |
chunks = []
|
| 31 |
try:
|
| 32 |
-
#
|
| 33 |
doc = fitz.open(file_path)
|
| 34 |
|
| 35 |
-
#
|
| 36 |
for page_num in range(len(doc)):
|
| 37 |
page = doc.load_page(page_num)
|
| 38 |
text = page.get_text()
|
|
@@ -57,7 +57,7 @@ class SimpleDocumentParser:
|
|
| 57 |
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
| 58 |
text = f.read()
|
| 59 |
|
| 60 |
-
#
|
| 61 |
paragraphs = text.split('\n\n')
|
| 62 |
for para in paragraphs:
|
| 63 |
if len(para.strip()) > 0:
|
|
@@ -74,7 +74,7 @@ class SimpleDocumentParser:
|
|
| 74 |
try:
|
| 75 |
doc = docx.Document(file_path)
|
| 76 |
|
| 77 |
-
#
|
| 78 |
for para in doc.paragraphs:
|
| 79 |
if len(para.text.strip()) > 0:
|
| 80 |
chunks.append(para.text.strip())
|
|
@@ -85,7 +85,7 @@ class SimpleDocumentParser:
|
|
| 85 |
return chunks
|
| 86 |
|
| 87 |
def parse_tabular(self, file_path: str) -> List[str]:
|
| 88 |
-
"""
|
| 89 |
chunks = []
|
| 90 |
try:
|
| 91 |
file_ext = os.path.splitext(file_path)[1].lower()
|
|
@@ -95,22 +95,22 @@ class SimpleDocumentParser:
|
|
| 95 |
else: # Excel files
|
| 96 |
df = pd.read_excel(file_path)
|
| 97 |
|
| 98 |
-
#
|
| 99 |
summary = f"Table with {len(df)} rows and {len(df.columns)} columns. "
|
| 100 |
summary += f"Columns: {', '.join(df.columns.tolist())}"
|
| 101 |
chunks.append(summary)
|
| 102 |
|
| 103 |
-
#
|
| 104 |
col_types = df.dtypes.to_dict()
|
| 105 |
col_desc = "Column details:\n"
|
| 106 |
for col, dtype in col_types.items():
|
| 107 |
-
#
|
| 108 |
sample_values = df[col].dropna().unique()[:3]
|
| 109 |
sample_str = ", ".join([str(v) for v in sample_values])
|
| 110 |
col_desc += f"- {col} (Type: {dtype}): Sample values: {sample_str}\n"
|
| 111 |
chunks.append(col_desc)
|
| 112 |
|
| 113 |
-
#
|
| 114 |
for index, row in df.head(50).iterrows():
|
| 115 |
row_text = " | ".join([f"{col}: {val}" for col, val in row.items()])
|
| 116 |
chunks.append(row_text)
|
|
|
|
| 22 |
elif file_ext in ['.csv', '.xlsx', '.xls']:
|
| 23 |
return self.parse_tabular(file_path)
|
| 24 |
else:
|
| 25 |
+
|
| 26 |
return self.parse_text(file_path)
|
| 27 |
|
| 28 |
def parse_pdf(self, file_path: str) -> List[str]:
|
| 29 |
"""Parse PDF using PyMuPDF"""
|
| 30 |
chunks = []
|
| 31 |
try:
|
| 32 |
+
# Opening the PDF
|
| 33 |
doc = fitz.open(file_path)
|
| 34 |
|
| 35 |
+
# Extracting text from each page
|
| 36 |
for page_num in range(len(doc)):
|
| 37 |
page = doc.load_page(page_num)
|
| 38 |
text = page.get_text()
|
|
|
|
| 57 |
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
| 58 |
text = f.read()
|
| 59 |
|
| 60 |
+
# Splitting by paragraphs
|
| 61 |
paragraphs = text.split('\n\n')
|
| 62 |
for para in paragraphs:
|
| 63 |
if len(para.strip()) > 0:
|
|
|
|
| 74 |
try:
|
| 75 |
doc = docx.Document(file_path)
|
| 76 |
|
| 77 |
+
# Extracting text from paragraphs
|
| 78 |
for para in doc.paragraphs:
|
| 79 |
if len(para.text.strip()) > 0:
|
| 80 |
chunks.append(para.text.strip())
|
|
|
|
| 85 |
return chunks
|
| 86 |
|
| 87 |
def parse_tabular(self, file_path: str) -> List[str]:
|
| 88 |
+
"""Parsing CSV or Excel files using pandas"""
|
| 89 |
chunks = []
|
| 90 |
try:
|
| 91 |
file_ext = os.path.splitext(file_path)[1].lower()
|
|
|
|
| 95 |
else: # Excel files
|
| 96 |
df = pd.read_excel(file_path)
|
| 97 |
|
| 98 |
+
# Adding table summary
|
| 99 |
summary = f"Table with {len(df)} rows and {len(df.columns)} columns. "
|
| 100 |
summary += f"Columns: {', '.join(df.columns.tolist())}"
|
| 101 |
chunks.append(summary)
|
| 102 |
|
| 103 |
+
# Adding column descriptions with data types
|
| 104 |
col_types = df.dtypes.to_dict()
|
| 105 |
col_desc = "Column details:\n"
|
| 106 |
for col, dtype in col_types.items():
|
| 107 |
+
# Adding sample values for each column (first 3 unique values)
|
| 108 |
sample_values = df[col].dropna().unique()[:3]
|
| 109 |
sample_str = ", ".join([str(v) for v in sample_values])
|
| 110 |
col_desc += f"- {col} (Type: {dtype}): Sample values: {sample_str}\n"
|
| 111 |
chunks.append(col_desc)
|
| 112 |
|
| 113 |
+
# Converting each row to a text chunk (limit to first 50 rows for indexing)
|
| 114 |
for index, row in df.head(50).iterrows():
|
| 115 |
row_text = " | ".join([f"{col}: {val}" for col, val in row.items()])
|
| 116 |
chunks.append(row_text)
|
backend/vector_db.py
CHANGED
|
@@ -42,8 +42,8 @@ class ChromaVectorDB:
|
|
| 42 |
return results
|
| 43 |
|
| 44 |
def delete_document(self, file_path: str):
|
| 45 |
-
"""
|
| 46 |
-
#
|
| 47 |
results = self.collection.get(
|
| 48 |
where={"source": file_path}
|
| 49 |
)
|
|
@@ -54,11 +54,11 @@ class ChromaVectorDB:
|
|
| 54 |
def reset_collection(self):
|
| 55 |
"""Reset the collection by clearing all documents"""
|
| 56 |
try:
|
| 57 |
-
#
|
| 58 |
try:
|
| 59 |
all_ids = self.collection.get()["ids"]
|
| 60 |
if all_ids:
|
| 61 |
-
#
|
| 62 |
self.collection.delete(ids=all_ids)
|
| 63 |
print(f"Deleted {len(all_ids)} documents from collection")
|
| 64 |
else:
|
|
@@ -67,7 +67,7 @@ class ChromaVectorDB:
|
|
| 67 |
except Exception as e:
|
| 68 |
print(f"Error getting or deleting documents: {str(e)}")
|
| 69 |
|
| 70 |
-
#
|
| 71 |
try:
|
| 72 |
self.client.delete_collection("documents")
|
| 73 |
self.collection = self.client.get_or_create_collection("documents")
|
|
|
|
| 42 |
return results
|
| 43 |
|
| 44 |
def delete_document(self, file_path: str):
|
| 45 |
+
"""Deleting all chunks from a specific document"""
|
| 46 |
+
# Getting all IDs related to this document
|
| 47 |
results = self.collection.get(
|
| 48 |
where={"source": file_path}
|
| 49 |
)
|
|
|
|
| 54 |
def reset_collection(self):
|
| 55 |
"""Reset the collection by clearing all documents"""
|
| 56 |
try:
|
| 57 |
+
# Getting all document IDs
|
| 58 |
try:
|
| 59 |
all_ids = self.collection.get()["ids"]
|
| 60 |
if all_ids:
|
| 61 |
+
# Deleting all documents
|
| 62 |
self.collection.delete(ids=all_ids)
|
| 63 |
print(f"Deleted {len(all_ids)} documents from collection")
|
| 64 |
else:
|
|
|
|
| 67 |
except Exception as e:
|
| 68 |
print(f"Error getting or deleting documents: {str(e)}")
|
| 69 |
|
| 70 |
+
# Trying to recreate the collection as a fallback
|
| 71 |
try:
|
| 72 |
self.client.delete_collection("documents")
|
| 73 |
self.collection = self.client.get_or_create_collection("documents")
|