NeelTA committed on
Commit
d2fe6cc
·
1 Parent(s): 18f151e

initial commit

Browse files
Files changed (12) hide show
  1. .gitignore +225 -0
  2. Dockerfile +16 -0
  3. docHandler.py +141 -0
  4. frontend/css/styles.css +466 -0
  5. frontend/index.html +88 -0
  6. frontend/js/main.js +493 -0
  7. main.py +269 -0
  8. pdfHandler.py +141 -0
  9. requirements.txt +42 -0
  10. task_manager.py +145 -0
  11. txtHandler.py +141 -0
  12. webHandler.py +110 -0
.gitignore ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Node.js
2
+ node_modules/
3
+ npm-debug.log*
4
+ yarn-debug.log*
5
+ yarn-error.log*
6
+ .npm
7
+ .env.development.local
8
+ .env.test.local
9
+ .env.production.local
10
+ .env.local
11
+
12
+ # Byte-compiled / optimized / DLL files
13
+ __pycache__/
14
+ *.py[codz]
15
+ *$py.class
16
+
17
+ # C extensions
18
+ *.so
19
+
20
+ # Distribution / packaging
21
+ .Python
22
+ build/
23
+ develop-eggs/
24
+ dist/
25
+ downloads/
26
+ eggs/
27
+ .eggs/
28
+ lib/
29
+ lib64/
30
+ parts/
31
+ sdist/
32
+ var/
33
+ wheels/
34
+ share/python-wheels/
35
+ *.egg-info/
36
+ .installed.cfg
37
+ *.egg
38
+ MANIFEST
39
+
40
+ # PyInstaller
41
+ # Usually these files are written by a python script from a template
42
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
43
+ *.manifest
44
+ *.spec
45
+
46
+ # Installer logs
47
+ pip-log.txt
48
+ pip-delete-this-directory.txt
49
+
50
+ # Unit test / coverage reports
51
+ htmlcov/
52
+ .tox/
53
+ .nox/
54
+ .coverage
55
+ .coverage.*
56
+ .cache
57
+ nosetests.xml
58
+ coverage.xml
59
+ *.cover
60
+ *.py.cover
61
+ .hypothesis/
62
+ .pytest_cache/
63
+ cover/
64
+
65
+ # Translations
66
+ *.mo
67
+ *.pot
68
+
69
+ # Django stuff:
70
+ *.log
71
+ local_settings.py
72
+ db.sqlite3
73
+ db.sqlite3-journal
74
+
75
+ # Flask stuff:
76
+ instance/
77
+ .webassets-cache
78
+
79
+ # Scrapy stuff:
80
+ .scrapy
81
+
82
+ # Sphinx documentation
83
+ docs/_build/
84
+
85
+ # PyBuilder
86
+ .pybuilder/
87
+ target/
88
+
89
+ # Jupyter Notebook
90
+ .ipynb_checkpoints
91
+
92
+ # IPython
93
+ profile_default/
94
+ ipython_config.py
95
+
96
+ # pyenv
97
+ # For a library or package, you might want to ignore these files since the code is
98
+ # intended to run in multiple environments; otherwise, check them in:
99
+ # .python-version
100
+
101
+ # pipenv
102
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
103
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
104
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
105
+ # install all needed dependencies.
106
+ #Pipfile.lock
107
+
108
+ # UV
109
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
110
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
111
+ # commonly ignored for libraries.
112
+ #uv.lock
113
+
114
+ # poetry
115
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
116
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
117
+ # commonly ignored for libraries.
118
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
119
+ #poetry.lock
120
+ #poetry.toml
121
+
122
+ # pdm
123
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
124
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
125
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
126
+ #pdm.lock
127
+ #pdm.toml
128
+ .pdm-python
129
+ .pdm-build/
130
+
131
+ # pixi
132
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
133
+ #pixi.lock
134
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
135
+ # in the .venv directory. It is recommended not to include this directory in version control.
136
+ .pixi
137
+
138
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
139
+ __pypackages__/
140
+
141
+ # Celery stuff
142
+ celerybeat-schedule
143
+ celerybeat.pid
144
+
145
+ # SageMath parsed files
146
+ *.sage.py
147
+
148
+ # Environments
149
+ .env
150
+ .envrc
151
+ .venv
152
+ env/
153
+ venv/
154
+ ENV/
155
+ env.bak/
156
+ venv.bak/
157
+
158
+ # Spyder project settings
159
+ .spyderproject
160
+ .spyproject
161
+
162
+ # Rope project settings
163
+ .ropeproject
164
+
165
+ # mkdocs documentation
166
+ /site
167
+
168
+ # mypy
169
+ .mypy_cache/
170
+ .dmypy.json
171
+ dmypy.json
172
+
173
+ # Pyre type checker
174
+ .pyre/
175
+
176
+ # pytype static type analyzer
177
+ .pytype/
178
+
179
+ # Cython debug symbols
180
+ cython_debug/
181
+
182
+ # PyCharm
183
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
184
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
185
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
186
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
187
+ #.idea/
188
+
189
+ # Abstra
190
+ # Abstra is an AI-powered process automation framework.
191
+ # Ignore directories containing user credentials, local state, and settings.
192
+ # Learn more at https://abstra.io/docs
193
+ .abstra/
194
+
195
+ # Visual Studio Code
196
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
197
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
198
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
199
+ # you could uncomment the following to ignore the entire vscode folder
200
+ # .vscode/
201
+
202
+ # Ruff stuff:
203
+ .ruff_cache/
204
+
205
+ # PyPI configuration file
206
+ .pypirc
207
+
208
+ # Cursor
209
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
210
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
211
+ # refer to https://docs.cursor.com/context/ignore-files
212
+ .cursorignore
213
+ .cursorindexingignore
214
+
215
+ # Marimo
216
+ marimo/_static/
217
+ marimo/_lsp/
218
+ __marimo__/
219
+
220
+ # ChatWithDoc specific
221
+ chatWithDocEnv/
222
+ uploaded_files/
223
+ .vscode/
224
+ .env
225
+ .env.local
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# Run as a non-root user with uid 1000, as recommended for HF Spaces.
RUN useradd -m -u 1000 user
USER user
# Make user-installed pip entry points (e.g. uvicorn) resolvable.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install dependencies before copying source so this layer is cached
# across code-only changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the application source last.
COPY --chown=user . /app
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
docHandler.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import Docx2txtLoader
2
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
3
+ from langchain.chat_models import init_chat_model
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ import faiss
6
+ from langchain_community.docstore.in_memory import InMemoryDocstore
7
+ from langchain_community.vectorstores import FAISS
8
+ import os
9
+ from langchain import hub
10
+ from dotenv import load_dotenv
11
+ from langgraph.graph import START, StateGraph
12
+ from typing import List, Dict, Any, Optional
13
+ from pydantic import BaseModel, Field
14
+ from langchain.docstore.document import Document
15
+
16
+ load_dotenv()
17
+
18
class State(BaseModel):
    """Shared state flowing through the retrieve -> generate LangGraph steps."""

    # The user's question; drives both retrieval and generation.
    question: str = Field(..., description="Type your question here")
    # Chunks retrieved from the vector store for this question.
    context: List[Document] = Field(
        default_factory=list,
        description="A list of Document objects",
    )
    # Final LLM answer; empty until the generate step has run.
    answer: str = Field(default="", description="Answer will be here")

26
class DocProcessor:
    """RAG pipeline over a single DOCX file.

    Loads a .docx, splits it into overlapping character chunks, embeds the
    chunks into an in-memory FAISS index, and answers questions through a
    two-step retrieve -> generate LangGraph driven by a Gemini chat model.
    """

    def __init__(self):
        # Load model provider.
        # Fail fast on a missing key; otherwise the first LLM call would fail
        # later with a less obvious error.
        if not os.environ.get("GOOGLE_API_KEY"):
            raise ValueError("Google Gemini API key not found in environment variables")

        self.llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")
        # Local, CPU-only sentence-transformer embeddings (no external API calls).
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"}
        )
        # Community RAG prompt template pulled from the LangChain hub.
        self.prompt = hub.pull("rlm/rag-prompt")
        self.vector_store = None  # populated by process_docx()
        self.chunk_size = 1000    # characters per chunk
        self.chunk_overlap = 200  # characters shared by adjacent chunks


    def process_docx(self, file_path: str) -> Dict[str, Any]:
        """
        Process a DOCX file and prepare it for querying

        Args:
            file_path (str): Path to the DOCX file

        Returns:
            Dict[str, Any]: Processing status and information
        """
        try:
            # Document Loading
            loader = Docx2txtLoader(file_path)
            pages = loader.load()

            # Text Splitting
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
            texts = text_splitter.split_documents(pages)

            # Vector Store Setup
            # Embed a dummy string once to discover the embedding dimensionality
            # needed to size the FAISS index.
            embedding_dim = len(self.embedding_model.embed_query("test"))
            index = faiss.IndexFlatL2(embedding_dim)

            # NOTE: a fresh store is created on every call, so processing a new
            # file replaces any previously indexed document.
            self.vector_store = FAISS(
                embedding_function=self.embedding_model,
                index=index,
                docstore=InMemoryDocstore(),
                index_to_docstore_id={},
            )

            # Index chunks
            self.vector_store.add_documents(documents=texts)

            return {
                "status": "success",
                "message": "DOCX processed successfully",
                "num_pages": len(pages),
                "num_chunks": len(texts)
            }
        except Exception as e:
            # Surface the failure as a status dict rather than raising, so the
            # web layer can return it to the client directly.
            return {
                "status": "error",
                "message": f"Error processing DOCX: {str(e)}"
            }

    def query_response(self, query: str) -> Dict[str, Any]:
        """
        Query the processed document

        Args:
            query (str): The question to ask about the document

        Returns:
            Dict[str, Any]: Answer and relevant context
        """
        if not self.vector_store:
            return {
                "status": "error",
                "message": "No document has been processed yet"
            }

        try:
            # Create state graph (rebuilt per query; cheap relative to the LLM call)
            graph_builder = StateGraph(State)

            # Define retrieval step: fetch the chunks most similar to the question.
            def retrieve(state: State):
                retrieved_docs = self.vector_store.similarity_search(state.question)
                return {"context": retrieved_docs}

            # Define generation step: stuff the retrieved chunks into the RAG
            # prompt and ask the LLM for an answer.
            def generate(state: State):
                docs_content = "\n\n".join(doc.page_content for doc in state.context)
                messages = self.prompt.invoke({
                    "question": state.question,
                    "context": docs_content
                })
                response = self.llm.invoke(messages)
                return {"answer": response.content}

            # Build and compile the graph: retrieve -> generate, entry at retrieve.
            graph = graph_builder.add_sequence([retrieve, generate]).set_entry_point("retrieve").compile()

            # Execute the query
            response = graph.invoke({
                "question": query
            })

            return {
                "status": "success",
                "answer": response["answer"],
                "query": query
            }
        except Exception as e:
            return {
                "status": "error",
                "message": f"Error querying document: {str(e)}"
            }

frontend/css/styles.css ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ :root {
2
+ --primary: #4361ee;
3
+ --primary-light: #4895ef;
4
+ --secondary: #3f37c9;
5
+ --accent: #4cc9f0;
6
+ --light: #f8f9fa;
7
+ --dark: #212529;
8
+ --success: #4ade80;
9
+ --warning: #facc15;
10
+ --danger: #f87171;
11
+ --gray: #6c757d;
12
+ --light-gray: #e9ecef;
13
+ --border-radius: 12px;
14
+ --shadow: 0 4px 20px rgba(0, 0, 0, 0.08);
15
+ --transition: all 0.3s ease;
16
+ }
17
+
18
+ * {
19
+ margin: 0;
20
+ padding: 0;
21
+ box-sizing: border-box;
22
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
23
+ }
24
+
25
+ body {
26
+ background: linear-gradient(135deg, #f5f7fa 0%, #e4edf5 100%);
27
+ color: var(--dark);
28
+ min-height: 100vh;
29
+ padding: 20px;
30
+ display: flex;
31
+ flex-direction: column;
32
+ align-items: center;
33
+ }
34
+
35
+ .container {
36
+ width: 100%;
37
+ max-width: 1200px;
38
+ margin: 0 auto;
39
+ }
40
+
41
+ header {
42
+ text-align: center;
43
+ padding: 30px 0;
44
+ animation: fadeIn 0.8s ease-out;
45
+ }
46
+
47
+ header h1 {
48
+ font-size: 2.8rem;
49
+ margin-bottom: 10px;
50
+ color: var(--secondary);
51
+ background: linear-gradient(90deg, var(--primary), var(--accent));
52
+ -webkit-background-clip: text;
53
+ background-clip: text;
54
+ color: transparent;
55
+ }
56
+
57
+ header p {
58
+ font-size: 1.2rem;
59
+ color: var(--gray);
60
+ max-width: 600px;
61
+ margin: 0 auto;
62
+ }
63
+
64
+ .app-container {
65
+ display: flex;
66
+ gap: 30px;
67
+ margin-top: 20px;
68
+ }
69
+
70
+ @media (max-width: 900px) {
71
+ .app-container {
72
+ flex-direction: column;
73
+ }
74
+ }
75
+
76
+ .input-section {
77
+ flex: 1;
78
+ background: white;
79
+ border-radius: var(--border-radius);
80
+ padding: 25px;
81
+ box-shadow: var(--shadow);
82
+ animation: slideInLeft 0.6s ease-out;
83
+ }
84
+
85
+ .chat-section {
86
+ flex: 1.5;
87
+ display: flex;
88
+ flex-direction: column;
89
+ background: white;
90
+ border-radius: var(--border-radius);
91
+ box-shadow: var(--shadow);
92
+ overflow: hidden;
93
+ animation: slideInRight 0.6s ease-out;
94
+ }
95
+
96
+ .section-title {
97
+ font-size: 1.5rem;
98
+ margin-bottom: 20px;
99
+ color: var(--secondary);
100
+ display: flex;
101
+ align-items: center;
102
+ gap: 10px;
103
+ }
104
+
105
+ .section-title i {
106
+ background: var(--light-gray);
107
+ width: 40px;
108
+ height: 40px;
109
+ border-radius: 50%;
110
+ display: flex;
111
+ align-items: center;
112
+ justify-content: center;
113
+ }
114
+
115
+ .upload-area {
116
+ border: 2px dashed var(--light-gray);
117
+ border-radius: var(--border-radius);
118
+ padding: 30px;
119
+ text-align: center;
120
+ margin-bottom: 25px;
121
+ transition: var(--transition);
122
+ cursor: pointer;
123
+ }
124
+
125
+ .upload-area:hover {
126
+ border-color: var(--primary);
127
+ background: rgba(67, 97, 238, 0.05);
128
+ }
129
+
130
+ .upload-area i {
131
+ font-size: 3rem;
132
+ color: var(--primary);
133
+ margin-bottom: 15px;
134
+ }
135
+
136
+ .upload-area h3 {
137
+ margin-bottom: 10px;
138
+ color: var(--dark);
139
+ }
140
+
141
+ .upload-area p {
142
+ color: var(--gray);
143
+ margin-bottom: 20px;
144
+ }
145
+
146
+ .file-types {
147
+ display: flex;
148
+ justify-content: center;
149
+ gap: 15px;
150
+ margin-top: 15px;
151
+ }
152
+
153
+ .file-type {
154
+ background: var(--light-gray);
155
+ padding: 8px 15px;
156
+ border-radius: 30px;
157
+ font-size: 0.9rem;
158
+ }
159
+
160
+ .url-input {
161
+ margin-bottom: 25px;
162
+ }
163
+
164
+ .url-input label {
165
+ display: block;
166
+ margin-bottom: 8px;
167
+ font-weight: 500;
168
+ }
169
+
170
+ .url-input input {
171
+ width: 100%;
172
+ padding: 14px;
173
+ border: 1px solid var(--light-gray);
174
+ border-radius: var(--border-radius);
175
+ font-size: 1rem;
176
+ transition: var(--transition);
177
+ }
178
+
179
+ .url-input input:focus {
180
+ outline: none;
181
+ border-color: var(--primary);
182
+ box-shadow: 0 0 0 3px rgba(67, 97, 238, 0.2);
183
+ }
184
+
185
+ .btn {
186
+ background: var(--primary);
187
+ color: white;
188
+ border: none;
189
+ padding: 14px 25px;
190
+ border-radius: var(--border-radius);
191
+ font-size: 1rem;
192
+ font-weight: 600;
193
+ cursor: pointer;
194
+ transition: var(--transition);
195
+ display: inline-flex;
196
+ align-items: center;
197
+ justify-content: center;
198
+ gap: 8px;
199
+ }
200
+
201
+ .btn:hover {
202
+ background: var(--secondary);
203
+ transform: translateY(-2px);
204
+ }
205
+
206
+ .btn:active {
207
+ transform: translateY(0);
208
+ }
209
+
210
+ .btn-block {
211
+ width: 100%;
212
+ }
213
+
214
+ .btn-outline {
215
+ background: transparent;
216
+ border: 2px solid var(--primary);
217
+ color: var(--primary);
218
+ }
219
+
220
+ .btn-outline:hover {
221
+ background: var(--primary);
222
+ color: white;
223
+ }
224
+
225
+ .file-list {
226
+ margin-top: 25px;
227
+ }
228
+
229
+ .file-item {
230
+ display: flex;
231
+ align-items: center;
232
+ padding: 12px 15px;
233
+ background: var(--light-gray);
234
+ border-radius: var(--border-radius);
235
+ margin-bottom: 10px;
236
+ animation: fadeIn 0.3s ease-out;
237
+ }
238
+
239
+ .file-item i {
240
+ margin-right: 12px;
241
+ color: var(--primary);
242
+ }
243
+
244
+ .file-info {
245
+ flex: 1;
246
+ }
247
+
248
+ .file-name {
249
+ font-weight: 500;
250
+ margin-bottom: 3px;
251
+ }
252
+
253
+ .file-size {
254
+ font-size: 0.85rem;
255
+ color: var(--gray);
256
+ }
257
+
258
+ .file-actions {
259
+ display: flex;
260
+ gap: 10px;
261
+ }
262
+
263
+ .file-actions button {
264
+ background: none;
265
+ border: none;
266
+ color: var(--gray);
267
+ cursor: pointer;
268
+ font-size: 1.1rem;
269
+ transition: var(--transition);
270
+ }
271
+
272
+ .file-actions button:hover {
273
+ color: var(--danger);
274
+ }
275
+
276
+ .chat-header {
277
+ background: var(--primary);
278
+ color: white;
279
+ padding: 20px;
280
+ display: flex;
281
+ align-items: center;
282
+ gap: 15px;
283
+ }
284
+
285
+ .chat-header img {
286
+ width: 50px;
287
+ height: 50px;
288
+ border-radius: 50%;
289
+ background: white;
290
+ padding: 5px;
291
+ }
292
+
293
+ .chat-messages {
294
+ flex: 1;
295
+ padding: 25px;
296
+ overflow-y: auto;
297
+ display: flex;
298
+ flex-direction: column;
299
+ gap: 20px;
300
+ background: #f8fafc;
301
+ }
302
+
303
+ .message {
304
+ max-width: 80%;
305
+ padding: 18px;
306
+ border-radius: var(--border-radius);
307
+ animation: fadeIn 0.3s ease-out;
308
+ position: relative;
309
+ box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
310
+ }
311
+
312
+ .user-message {
313
+ background: var(--primary-light);
314
+ color: white;
315
+ align-self: flex-end;
316
+ border-bottom-right-radius: 5px;
317
+ }
318
+
319
+ .bot-message {
320
+ background: white;
321
+ border: 1px solid var(--light-gray);
322
+ align-self: flex-start;
323
+ border-bottom-left-radius: 5px;
324
+ }
325
+
326
+ .message-header {
327
+ display: flex;
328
+ align-items: center;
329
+ margin-bottom: 8px;
330
+ font-weight: 600;
331
+ }
332
+
333
+ .message-header i {
334
+ margin-right: 8px;
335
+ }
336
+
337
+ .message-content {
338
+ line-height: 1.5;
339
+ }
340
+
341
+ .typing-indicator {
342
+ display: flex;
343
+ align-items: center;
344
+ padding: 18px;
345
+ background: white;
346
+ border: 1px solid var(--light-gray);
347
+ border-radius: var(--border-radius);
348
+ align-self: flex-start;
349
+ border-bottom-left-radius: 5px;
350
+ width: 100px;
351
+ }
352
+
353
+ .typing-dot {
354
+ width: 8px;
355
+ height: 8px;
356
+ background: var(--gray);
357
+ border-radius: 50%;
358
+ margin: 0 3px;
359
+ animation: typing 1.4s infinite ease-in-out;
360
+ }
361
+
362
+ .typing-dot:nth-child(1) { animation-delay: 0s; }
363
+ .typing-dot:nth-child(2) { animation-delay: 0.2s; }
364
+ .typing-dot:nth-child(3) { animation-delay: 0.4s; }
365
+
366
+ .chat-input {
367
+ display: flex;
368
+ padding: 20px;
369
+ background: white;
370
+ border-top: 1px solid var(--light-gray);
371
+ }
372
+
373
+ .chat-input input {
374
+ flex: 1;
375
+ padding: 16px;
376
+ border: 1px solid var(--light-gray);
377
+ border-radius: 30px;
378
+ font-size: 1rem;
379
+ transition: var(--transition);
380
+ }
381
+
382
+ .chat-input input:focus {
383
+ outline: none;
384
+ border-color: var(--primary);
385
+ box-shadow: 0 0 0 3px rgba(67, 97, 238, 0.2);
386
+ }
387
+
388
+ .chat-input button {
389
+ background: var(--primary);
390
+ color: white;
391
+ border: none;
392
+ width: 50px;
393
+ height: 50px;
394
+ border-radius: 50%;
395
+ margin-left: 15px;
396
+ cursor: pointer;
397
+ transition: var(--transition);
398
+ display: flex;
399
+ align-items: center;
400
+ justify-content: center;
401
+ }
402
+
403
+ .chat-input button:hover {
404
+ background: var(--secondary);
405
+ transform: scale(1.05);
406
+ }
407
+
408
+
409
+
410
+ @keyframes fadeIn {
411
+ from { opacity: 0; transform: translateY(10px); }
412
+ to { opacity: 1; transform: translateY(0); }
413
+ }
414
+
415
+ @keyframes slideInLeft {
416
+ from { opacity: 0; transform: translateX(-30px); }
417
+ to { opacity: 1; transform: translateX(0); }
418
+ }
419
+
420
+ @keyframes slideInRight {
421
+ from { opacity: 0; transform: translateX(30px); }
422
+ to { opacity: 1; transform: translateX(0); }
423
+ }
424
+
425
+ @keyframes typing {
426
+ 0%, 60%, 100% { transform: translateY(0); }
427
+ 30% { transform: translateY(-5px); }
428
+ }
429
+
430
+ .processing {
431
+ display: flex;
432
+ align-items: center;
433
+ justify-content: center;
434
+ padding: 30px;
435
+ color: var(--gray);
436
+ }
437
+
438
+ .processing i {
439
+ font-size: 2rem;
440
+ margin-right: 15px;
441
+ color: var(--primary);
442
+ animation: spin 1.5s linear infinite;
443
+ }
444
+
445
+ @keyframes spin {
446
+ 0% { transform: rotate(0deg); }
447
+ 100% { transform: rotate(360deg); }
448
+ }
449
+
450
+ footer {
451
+ text-align: center;
452
+ padding: 30px 0;
453
+ color: var(--gray);
454
+ font-size: 0.9rem;
455
+ margin-top: auto;
456
+ }
457
+
458
+ .pulse {
459
+ animation: pulse 2s infinite;
460
+ }
461
+
462
+ @keyframes pulse {
463
+ 0% { box-shadow: 0 0 0 0 rgba(67, 97, 238, 0.4); }
464
+ 70% { box-shadow: 0 0 0 10px rgba(67, 97, 238, 0); }
465
+ 100% { box-shadow: 0 0 0 0 rgba(67, 97, 238, 0); }
466
+ }
frontend/index.html ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>ChatWithDoc - Chat with Your Documents</title>
7
+ <link rel="stylesheet" href="css/styles.css">
8
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
9
+ </head>
10
+ <body>
11
+ <div class="container">
12
+ <header>
13
+ <h1><i class="fas fa-robot"></i> ChatWithDoc</h1>
14
+ <p>Upload documents or enter URLs, then chat with your content using AI</p>
15
+ </header>
16
+
17
+ <div class="app-container">
18
+ <div class="input-section">
19
+ <h2 class="section-title"><i class="fas fa-file-upload"></i> Upload Documents</h2>
20
+
21
+ <div class="upload-area" id="uploadArea">
22
+ <i class="fas fa-cloud-upload-alt"></i>
23
+ <h3>Drag & Drop Files Here</h3>
24
+ <p>Supports PDF, DOC, DOCX, TXT files</p>
25
+ <button class="btn btn-outline">Browse Files</button>
26
+ <input type="file" id="fileInput" multiple accept=".pdf,.doc,.docx,.txt" style="display: none;">
27
+
28
+ <div class="file-types">
29
+ <div class="file-type">PDF</div>
30
+ <div class="file-type">DOC</div>
31
+ <div class="file-type">DOCX</div>
32
+ <div class="file-type">TXT</div>
33
+ </div>
34
+ </div>
35
+
36
+ <div class="file-list" id="fileList">
37
+ <!-- File items will be added here dynamically -->
38
+ </div>
39
+
40
+ <div class="url-input">
41
+ <label for="urlInput"><i class="fas fa-link"></i> Or Enter a Web Page URL</label>
42
+ <input type="url" id="urlInput" placeholder="https://example.com/article">
43
+ </div>
44
+
45
+ <button class="btn btn-block pulse" id="processBtn">
46
+ <i class="fas fa-cogs"></i> Process Documents & URLs
47
+ </button>
48
+ </div>
49
+
50
+ <div class="chat-section">
51
+ <div class="chat-header">
52
+ <img src="data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'%3E%3Cpath fill='%234361ee' d='M12 2C6.48 2 2 6.48 2 12s4.48 10 10 10 10-4.48 10-10S17.52 2 12 2zm-1 15h-2v-6h2v6zm3 0h-2v-6h2v6zm3 0h-2v-6h2v6z'/%3E%3C/svg%3E" alt="AI Assistant">
53
+ <div>
54
+ <h2>ChatWithDoc Assistant</h2>
55
+ <p>Ask me anything about your documents</p>
56
+ </div>
57
+ </div>
58
+
59
+ <div class="chat-messages" id="chatMessages">
60
+ <div class="message bot-message">
61
+ <div class="message-header">
62
+ <i class="fas fa-robot"></i> ChatWithDoc Assistant
63
+ </div>
64
+ <div class="message-content">
65
+ Hello! I'm your document assistant. Upload some documents or enter URLs, then ask me anything about their content. I'll help you find answers quickly.
66
+ </div>
67
+ </div>
68
+ </div>
69
+
70
+
71
+
72
+ <div class="chat-input">
73
+ <input type="text" id="messageInput" placeholder="Ask about your documents...">
74
+ <button id="sendButton">
75
+ <i class="fas fa-paper-plane"></i>
76
+ </button>
77
+ </div>
78
+ </div>
79
+ </div>
80
+
81
+ <footer>
82
+ <p>ChatWithDoc - Chat with your documents using AI</p>
83
+ </footer>
84
+ </div>
85
+
86
+ <script src="js/main.js"></script>
87
+ </body>
88
+ </html>
frontend/js/main.js ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// DOM Elements
const uploadArea = document.getElementById('uploadArea');
const fileInput = document.getElementById('fileInput');
const urlInput = document.getElementById('urlInput');
const processBtn = document.getElementById('processBtn');
const fileList = document.getElementById('fileList');
const chatMessages = document.getElementById('chatMessages');
const messageInput = document.getElementById('messageInput');
const sendButton = document.getElementById('sendButton');

// API Base URL (same origin; the backend also serves this frontend)
const API_BASE = '/';

console.log('JavaScript loaded successfully');

// Event Listeners
uploadArea.addEventListener('click', () => {
    console.log('Upload area clicked');
    fileInput.click();
});

fileInput.addEventListener('change', (e) => {
    console.log('File input changed');
    const files = e.target.files;
    console.log('Files detected:', files.length); // Debug log

    if (files.length > 0) {
        console.log('Files selected:', files.length);

        // Clear previous documents and UI first
        clearPreviousDocuments();

        // Snapshot the live FileList into a plain array BEFORE anything can
        // reset the input, which would invalidate the FileList.
        const fileArray = Array.from(files);
        console.log('Files stored in array:', fileArray.length);

        // Now process each file
        fileArray.forEach((file, index) => {
            console.log(`Processing file ${index + 1}:`, file.name, 'Type:', file.type);
            uploadFile(file);
        });
    } else {
        console.log('No files detected in change event');
    }
});

processBtn.addEventListener('click', () => {
    console.log('Process button clicked');
    processAllDocuments();
});

sendButton.addEventListener('click', () => {
    console.log('Send button clicked');
    sendMessage();
});

// FIX: use 'keydown' instead of the deprecated 'keypress' event; behavior for
// the Enter key is the same, but 'keypress' is deprecated and may be dropped.
messageInput.addEventListener('keydown', (e) => {
    if (e.key === 'Enter') {
        console.log('Enter key pressed');
        sendMessage();
    }
});
63
+
64
// Separate function for clearing previous documents (doesn't clear current input)
function clearPreviousDocuments() {
    // Resets the file-list UI, URL field, and chat window, then asks the
    // backend to drop previously processed documents. Intentionally does NOT
    // touch the file <input>, so a just-selected FileList stays valid.
    console.log('Clearing previous documents');

    // Clear the file list UI
    fileList.innerHTML = '';

    // Clear URL input
    urlInput.value = '';

    // Clear chat messages and restore initial state
    chatMessages.innerHTML = `
        <div class="message bot-message">
            <div class="message-header">
                <i class="fas fa-robot"></i> ChatWithDoc Assistant
            </div>
            <div class="message-content">
                Previous documents cleared. Ready for new uploads!
            </div>
        </div>
    `;

    // Call backend to clear previous documents
    // (fire-and-forget: failures are only logged, the UI reset stands)
    fetch(`${API_BASE}clear-documents`, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        }
    })
    .then(response => {
        console.log('Clear documents response status:', response.status);
        return response.json();
    })
    .then(data => {
        console.log('Previous documents cleared:', data);
    })
    .catch(error => {
        console.error('Error clearing documents:', error);
    });
}
104
+
105
// Full reset: clears the file <input> itself in addition to everything that
// clearPreviousDocuments() resets. Wire this to a dedicated "clear" button.
function clearAllFilesSync() {
    console.log('Clearing all files completely');

    // Resetting .value invalidates any pending FileList, so only do this for
    // a deliberate full reset.
    fileInput.value = '';

    clearPreviousDocuments();
}
115
+
116
// POST a single file to the backend /upload endpoint as multipart form data,
// reflecting progress ('uploading' -> 'uploaded'/'error') in the file list UI.
function uploadFile(file) {
    console.log('Starting file upload for:', file.name);

    const payload = new FormData();
    payload.append('file', file);

    // Show the entry in the UI immediately, before the request resolves.
    addFileToList(file.name, formatFileSize(file.size), 'uploading');

    console.log('Making fetch request to:', `${API_BASE}upload`);

    fetch(`${API_BASE}upload`, { method: 'POST', body: payload })
        .then((response) => {
            console.log('Upload response status:', response.status);
            if (!response.ok) {
                throw new Error(`HTTP error! status: ${response.status}`);
            }
            return response.json();
        })
        .then((result) => {
            console.log('Upload response data:', result);
            if (result.error) {
                updateFileStatus(file.name, 'error');
                alert('Error uploading file: ' + result.error);
                return;
            }
            updateFileStatus(file.name, 'uploaded');
            console.log('File uploaded successfully:', file.name);
        })
        .catch((err) => {
            console.error('Upload error:', err);
            updateFileStatus(file.name, 'error');
            alert('Error uploading file: ' + err.message);
        });
}
154
+
155
// Kick off server-side processing for every staged file and/or the URL
// typed into the URL box. A blocking spinner replaces the chat area until
// both pipelines settle; any failure surfaces as an alert.
function processAllDocuments() {
    console.log('Processing all documents');

    const url = urlInput.value.trim();
    const fileItems = document.querySelectorAll('.file-item');

    console.log('URL:', url, 'Files count:', fileItems.length);

    if (fileItems.length === 0 && !url) {
        alert('Please upload files or enter a URL first');
        return;
    }

    showProcessing();

    // Flip every listed file to the given status icon.
    const markAll = (status) => {
        fileItems.forEach(item => updateFileStatus(item.dataset.filename, status));
    };

    // Pipeline 1: files previously staged via /upload.
    let filesDone = Promise.resolve();
    if (fileItems.length > 0) {
        markAll('processing');
        console.log('Calling process-documents endpoint');

        filesDone = fetch(`${API_BASE}process-documents`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' }
        })
            .then(response => {
                console.log('Process documents response status:', response.status);
                if (!response.ok) {
                    throw new Error(`HTTP error! status: ${response.status}`);
                }
                return response.json();
            })
            .then(data => {
                console.log('Process documents response:', data);
                if (data.error) {
                    throw new Error(data.error);
                }
                markAll('processed');
                addBotMessage(`Successfully processed ${data.processed_count} files!`);
                return data;
            });
    }

    // Pipeline 2: the optional URL.
    let urlDone = Promise.resolve();
    if (url) {
        console.log('Processing URL:', url);

        urlDone = fetch(`${API_BASE}process-url`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ url: url })
        })
            .then(response => {
                console.log('Process URL response status:', response.status);
                return response.json();
            })
            .then(data => {
                console.log('Process URL response:', data);
                if (data.error) {
                    throw new Error(data.error);
                }
                addBotMessage(`URL processed successfully! Found ${data.document_info.num_pages} pages with ${data.document_info.num_chunks} text chunks.`);
                return data;
            });
    }

    // Wait for both pipelines before restoring the chat area.
    Promise.all([filesDone, urlDone])
        .then(() => {
            console.log('All processing completed');
            hideProcessing();
            addBotMessage("All documents and URLs have been processed successfully! You can now ask questions about them.");
        })
        .catch(error => {
            console.error('Processing error:', error);
            hideProcessing();
            alert('Error processing documents: ' + error.message);
        });
}
249
+
250
// Send the current chat input to the backend /chat endpoint and render
// the assistant's reply (or an error bubble) when it arrives.
function sendMessage() {
    const message = messageInput.value.trim();
    console.log('Sending message:', message);

    if (!message) {
        return; // ignore empty submissions
    }

    addUserMessage(message);
    messageInput.value = '';
    showTypingIndicator();

    fetch(`${API_BASE}chat`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ message: message })
    })
        .then(response => {
            console.log('Chat response status:', response.status);
            return response.json();
        })
        .then(data => {
            console.log('Chat response:', data);
            hideTypingIndicator();
            if (data.error) {
                addBotMessage("Sorry, I encountered an error: " + data.error);
            } else {
                addBotMessage(data.response);
            }
        })
        .catch(error => {
            console.error('Chat error:', error);
            hideTypingIndicator();
            addBotMessage("Sorry, I encountered an error processing your request.");
        });
}
288
+
289
// Render a newly selected file in the sidebar list and wire up its remove
// button. `status` selects the icon shown next to the entry.
// FIX: the file name and size were previously interpolated straight into
// innerHTML, so a file named e.g. `<img onerror=...>` could inject markup;
// they are now assigned via textContent.
function addFileToList(name, size, status = 'success') {
    console.log('Adding file to list:', name, 'Status:', status);

    // Icons keyed by upload/processing state; unknown states fall back to
    // the plain file icon (same behavior as the old if/else chain).
    const STATUS_ICONS = {
        uploading: '<i class="fas fa-spinner fa-spin"></i>',
        processing: '<i class="fas fa-cog fa-spin"></i>',
        error: '<i class="fas fa-exclamation-circle" style="color: var(--danger);"></i>',
        processed: '<i class="fas fa-check-circle" style="color: var(--success);"></i>',
        uploaded: '<i class="fas fa-file-alt"></i>'
    };
    const statusIcon = STATUS_ICONS[status] || '<i class="fas fa-file-alt"></i>';

    const fileItem = document.createElement('div');
    fileItem.className = 'file-item';
    fileItem.dataset.filename = name;

    fileItem.innerHTML = `
        ${statusIcon}
        <div class="file-info">
            <div class="file-name"></div>
            <div class="file-size"></div>
        </div>
        <div class="file-actions">
            <button title="Remove"><i class="fas fa-times"></i></button>
        </div>
    `;
    // textContent renders any HTML in the file name literally (XSS fix).
    fileItem.querySelector('.file-name').textContent = name;
    fileItem.querySelector('.file-size').textContent = size;

    fileList.appendChild(fileItem);

    // Let the user remove the entry again before processing.
    fileItem.querySelector('.file-actions button').addEventListener('click', () => {
        console.log('Removing file:', name);
        fileItem.remove();
    });
}
330
+
331
// Swap the status icon on every list entry whose data-filename matches
// `name`. The first <i> inside the entry is the status icon, so replacing
// its outerHTML updates the icon in place.
function updateFileStatus(name, status) {
    console.log('Updating file status:', name, 'to', status);

    const iconFor = (state) => {
        switch (state) {
            case 'uploading':
                return '<i class="fas fa-spinner fa-spin"></i>';
            case 'processing':
                return '<i class="fas fa-cog fa-spin"></i>';
            case 'error':
                return '<i class="fas fa-exclamation-circle" style="color: var(--danger);"></i>';
            case 'processed':
                return '<i class="fas fa-check-circle" style="color: var(--success);"></i>';
            case 'uploaded':
            default:
                return '<i class="fas fa-file-alt"></i>';
        }
    };

    document.querySelectorAll('.file-item').forEach(item => {
        if (item.dataset.filename !== name) {
            return;
        }
        const iconElement = item.querySelector('i');
        if (iconElement) {
            iconElement.outerHTML = iconFor(status);
        }
    });
}
358
+
359
// Convert a raw byte count into a human-readable string, e.g. 1536 -> "1.5 KB".
// FIX: the unit index is now clamped to the last entry so inputs >= 1 TB no
// longer render as "undefined", and negative/non-finite inputs (which used to
// produce "NaN undefined") are treated as zero.
function formatFileSize(bytes) {
    if (!Number.isFinite(bytes) || bytes <= 0) return '0 Bytes';
    const k = 1024;
    const sizes = ['Bytes', 'KB', 'MB', 'GB'];
    const i = Math.min(Math.floor(Math.log(bytes) / Math.log(k)), sizes.length - 1);
    return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
}
366
+
367
// Append the user's message bubble to the chat log and scroll it into view.
// FIX: the message body was interpolated into innerHTML unescaped, so typed
// HTML (e.g. <script>, <img onerror=...>) was injected into the DOM; it is
// now assigned via textContent and renders literally.
function addUserMessage(text) {
    const messageDiv = document.createElement('div');
    messageDiv.className = 'message user-message';
    messageDiv.innerHTML = `
        <div class="message-header">
            <i class="fas fa-user"></i> You
        </div>
        <div class="message-content"></div>
    `;
    messageDiv.querySelector('.message-content').textContent = text;
    chatMessages.appendChild(messageDiv);
    chatMessages.scrollTop = chatMessages.scrollHeight;
}
379
+
380
// Append an assistant message to the chat log. Markdown is rendered with
// marked and the resulting HTML is sanitized by DOMPurify before insertion.
function addBotMessage(text) {
    const safeHtml = DOMPurify.sanitize(marked.parse(text));

    const bubble = document.createElement('div');
    bubble.className = 'message bot-message';
    bubble.innerHTML =
        '<div class="message-header">' +
        '<i class="fas fa-robot"></i> ChatWithDoc Assistant' +
        '</div>' +
        `<div class="message-content">${safeHtml}</div>`;

    chatMessages.appendChild(bubble);
    chatMessages.scrollTop = chatMessages.scrollHeight;
}
394
+
395
// Show the animated "assistant is typing" dots at the bottom of the chat.
function showTypingIndicator() {
    const indicator = document.createElement('div');
    indicator.className = 'typing-indicator';
    indicator.id = 'typingIndicator';
    indicator.innerHTML =
        '<div class="typing-dot"></div>' +
        '<div class="typing-dot"></div>' +
        '<div class="typing-dot"></div>';
    chatMessages.appendChild(indicator);
    chatMessages.scrollTop = chatMessages.scrollHeight;
}
407
+
408
// Remove the typing dots if present; safe to call when none are shown.
function hideTypingIndicator() {
    const indicator = document.getElementById('typingIndicator');
    if (indicator !== null) {
        indicator.remove();
    }
}
414
+
415
// Replace the chat area with a single spinner row while documents and
// URLs are being processed on the server.
function showProcessing() {
    console.log('Showing processing indicator');

    const spinner = document.createElement('div');
    spinner.className = 'processing';
    spinner.id = 'processingIndicator';
    spinner.innerHTML = '<i class="fas fa-spinner fa-spin"></i> Processing documents and URLs...';

    chatMessages.innerHTML = '';
    chatMessages.appendChild(spinner);
}
426
+
427
// Remove the processing spinner and restore the default greeting bubble.
function hideProcessing() {
    console.log('Hiding processing indicator');

    const spinner = document.getElementById('processingIndicator');
    if (spinner) {
        spinner.remove();
    }

    // Reset the chat area to its initial state.
    chatMessages.innerHTML = `
        <div class="message bot-message">
            <div class="message-header">
                <i class="fas fa-robot"></i> ChatWithDoc Assistant
            </div>
            <div class="message-content">
                Hello! I'm your document assistant. Upload some documents or enter URLs, then ask me anything about their content. I'll help you find answers quickly.
            </div>
        </div>
    `;
}
446
+
447
// --- Drag & drop wiring for the upload area ---

// MIME types the backend accepts (PDF, DOC, DOCX, TXT).
const SUPPORTED_MIME_TYPES = [
    'application/pdf',
    'application/msword',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'text/plain'
];

// Highlight the drop target while a drag hovers over it.
uploadArea.addEventListener('dragover', (e) => {
    e.preventDefault();
    console.log('Drag over upload area');
    uploadArea.style.borderColor = 'var(--primary)';
    uploadArea.style.backgroundColor = 'rgba(67, 97, 238, 0.1)';
});

uploadArea.addEventListener('dragleave', () => {
    console.log('Drag leave upload area');
    uploadArea.style.borderColor = 'var(--light-gray)';
    uploadArea.style.backgroundColor = '';
});

uploadArea.addEventListener('drop', (e) => {
    e.preventDefault();
    console.log('Files dropped on upload area');
    uploadArea.style.borderColor = 'var(--light-gray)';
    uploadArea.style.backgroundColor = '';

    const dropped = Array.from(e.dataTransfer.files);
    console.log('Dropped files count:', dropped.length);
    if (dropped.length === 0) {
        return;
    }

    // A fresh drop replaces any previously staged documents.
    clearPreviousDocuments();

    dropped.forEach(file => {
        console.log('Processing dropped file:', file.name, 'Type:', file.type);
        if (SUPPORTED_MIME_TYPES.includes(file.type)) {
            uploadFile(file);
        } else {
            console.log('Unsupported file type:', file.type);
            alert(`Unsupported file type: ${file.type}. Please upload PDF, DOC, DOCX, or TXT files.`);
        }
    });
});

console.log('All event listeners attached successfully');
main.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile
2
+ from fastapi.responses import JSONResponse, FileResponse
3
+ from fastapi.staticfiles import StaticFiles
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ import shutil
6
+ import os
7
+ from pydantic import BaseModel, Field
8
+ from typing import Dict, Any, List
9
+ from task_manager import DocumentManager
10
+ import warnings
11
+
12
+ # Disable all LangSmith related warnings
13
+ warnings.filterwarnings("ignore", message=".*LangSmith.*")
14
+ warnings.filterwarnings("ignore", message=".*API key.*")
15
+
16
+ # Also disable UserAgent warning
17
+ os.environ["LANGCHAIN_USER_AGENT"] = "ChatWithDoc/1.0"
18
+
19
+ app = FastAPI()
20
+
21
+ # Initialize document manager
22
+ doc_manager = DocumentManager()
23
+
24
+ # Store uploaded files temporarily before processing
25
+ uploaded_files = []
26
+
27
class UploadResponse(BaseModel):
    """Response returned after a file upload or URL has been accepted/processed."""
    message: str
    document_info: Dict[str, Any]

class URLRequest(BaseModel):
    """Request body for /process-url."""
    url: str = Field(..., description="URL of the document to process")

class ChatRequest(BaseModel):
    """Request body for /chat."""
    message: str = Field(..., description="User's question")

class ChatResponse(BaseModel):
    """Response body for /chat."""
    response: str = Field(..., description="Answer to the user's question")

class ProcessResponse(BaseModel):
    """Summary of a /process-documents run."""
    message: str
    processed_count: int
    # default_factory gives each response its own list instead of a shared
    # mutable class-level default.
    errors: List[str] = Field(default_factory=list)
44
+
45
+ # Allow CORS (update this with your frontend URL in production)
46
+ app.add_middleware(
47
+ CORSMiddleware,
48
+ allow_origins=["*"], # Change to your React frontend URL in prod
49
+ allow_credentials=True,
50
+ allow_methods=["*"],
51
+ allow_headers=["*"],
52
+ )
53
+
54
+ UPLOAD_DIR = "uploaded_files"
55
+ os.makedirs(UPLOAD_DIR, exist_ok=True)
56
+
57
@app.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    """
    Upload a document (just stores it, doesn't process yet).

    - **file**: The document file to upload (PDF, DOC, DOCX, TXT)
    - Returns upload confirmation, or a 400 JSON error for unsupported types
    """
    print(f"Received file: {file.filename} of type {file.content_type}")

    if not file.filename:
        return JSONResponse(status_code=400, content={"error": "Missing filename"})

    # SECURITY FIX: keep only the base name so a crafted filename such as
    # "../../etc/passwd" cannot escape UPLOAD_DIR (path traversal).
    safe_name = os.path.basename(file.filename)

    # Map file extensions to content types; browsers often send generic or
    # missing content types, so the extension is the fallback authority.
    extension_to_type = {
        'pdf': 'application/pdf',
        'txt': 'text/plain',
        'doc': 'application/msword',
        'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    }
    file_extension = safe_name.lower().rsplit('.', 1)[-1] if '.' in safe_name else ''

    if file.content_type in extension_to_type.values():
        content_type = file.content_type
    elif file_extension in extension_to_type:
        content_type = extension_to_type[file_extension]
    else:
        return JSONResponse(status_code=400, content={"error": f"Unsupported file type: {file_extension}"})

    print(f"Using content type: {content_type}")

    file_location = os.path.join(UPLOAD_DIR, safe_name)

    # Persist the upload; actual processing happens later in /process-documents.
    with open(file_location, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    # Stage the file so /process-documents can pick it up.
    uploaded_files.append({
        "filename": safe_name,
        "file_location": file_location,
        "content_type": content_type
    })

    print("File uploaded successfully, ready for processing")
    return UploadResponse(
        message="File uploaded successfully",
        document_info={
            "filename": safe_name,
            "content_type": content_type,
            "status": "uploaded",
            "location": file_location
        }
    )
113
+
114
@app.post("/process-documents")
async def process_documents():
    """
    Run every file staged by /upload through the document manager.

    Returns a ProcessResponse summary on (partial) success, a 400 error when
    nothing was staged or nothing could be processed, and 500 on unexpected
    failure.
    """
    try:
        if not uploaded_files:
            return JSONResponse(status_code=400, content={"error": "No files uploaded"})

        successes = 0
        errors = []

        for info in uploaded_files:
            try:
                outcome = doc_manager.process_document(info["file_location"], info["content_type"])
            except Exception as exc:
                errors.append(f"{info['filename']}: {exc}")
                print(f"Exception processing {info['filename']}: {exc}")
                continue

            if outcome["status"] == "success":
                successes += 1
                print(f"Successfully processed: {info['filename']}")
            else:
                errors.append(f"{info['filename']}: {outcome['message']}")
                print(f"Failed to process {info['filename']}: {outcome['message']}")

        # The staging list is consumed whether or not processing succeeded.
        uploaded_files.clear()

        if successes == 0:
            return JSONResponse(status_code=400, content={
                "error": f"Failed to process any files. Errors: {'; '.join(errors)}"
            })

        summary = f"Successfully processed {successes} files"
        if errors:
            summary += f". {len(errors)} files had errors."

        return ProcessResponse(
            message=summary,
            processed_count=successes,
            errors=errors
        )

    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
164
+
165
@app.post("/process-url")
async def process_url(url_request: URLRequest):
    """
    Fetch and index a document from a URL via the web processor.

    - **url**: The URL of the document to process
    - Returns document processing information
    """
    target = url_request.url

    try:
        outcome = doc_manager.process_url(target)
        print("URL processing result:", outcome)

        if outcome["status"] == "error":
            return JSONResponse(status_code=400, content={"error": outcome["message"]})

        info = {
            "url": target,
            "status": "processed",
            "type": "url",
            "title": outcome.get("title", "Untitled"),
            "num_pages": outcome.get("num_pages", 0),
            "num_chunks": outcome.get("num_chunks", 0),
            "word_count": outcome.get("word_count", 0),
        }
        return UploadResponse(message="URL processed successfully", document_info=info)

    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
198
+
199
@app.post("/chat")
async def chat_with_doc(chat_request: ChatRequest):
    """
    Answer a user question against the processed document collection.

    - **message**: The user's question
    - Returns a ChatResponse with the combined answer
    """
    try:
        print(f"Received query: {chat_request.message}")
        outcome = doc_manager.query_document(chat_request.message)
        print("Query result:", outcome)

        if outcome["status"] == "error":
            return JSONResponse(status_code=400, content={"error": outcome["message"]})

        return ChatResponse(response=outcome["answer"])
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
220
+
221
@app.post("/clear-documents")
async def clear_documents():
    """Drop every processed document and any files still staged for upload."""
    print("Clearing all documents...")
    try:
        doc_manager.clear_documents()
        uploaded_files.clear()
        return {"message": "Documents cleared successfully"}
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
233
+
234
@app.get("/status")
async def get_status():
    """Report how many files are staged and what the document manager holds."""
    try:
        if hasattr(doc_manager, 'get_status'):
            doc_status = doc_manager.get_status()
        else:
            # Fallback for the original single-document manager interface.
            has_current = hasattr(doc_manager, 'current_processor') and doc_manager.current_processor
            doc_status = {
                "total_documents": 1 if has_current else 0,
                "current_document": getattr(doc_manager, 'current_document', None)
            }

        return {
            "uploaded_files": len(uploaded_files),
            "status": doc_status
        }
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
256
+
257
# Liveness probe for deployment platforms (e.g. container health checks).
@app.get("/health")
async def health_check():
    """Return a static payload confirming the API process is up."""
    return {"status": "healthy", "message": "ChatWithDoc API is running"}
262
+
263
# Serve the static frontend from "/". Mounted last so it never shadows the
# API routes declared above.
app.mount("/", StaticFiles(directory="frontend", html=True), name="frontend")

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
pdfHandler.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFLoader
2
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
3
+ from langchain.chat_models import init_chat_model
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ import faiss
6
+ from langchain_community.docstore.in_memory import InMemoryDocstore
7
+ from langchain_community.vectorstores import FAISS
8
+ import os
9
+ from langchain import hub
10
+ from dotenv import load_dotenv
11
+ from langgraph.graph import START, StateGraph
12
+ from typing import List, Dict, Any, Optional
13
+ from pydantic import BaseModel, Field
14
+ from langchain.docstore.document import Document
15
+
16
+ load_dotenv()
17
+
18
class State(BaseModel):
    """Shared state flowing through the retrieve -> generate RAG graph."""
    question: str = Field(..., description="Type your question here")
    context: List[Document] = Field(
        default_factory=list,
        description="A list of Document objects",
    )
    answer: str = Field(default="", description="Answer will be here")
25
+
26
class PDFProcessor:
    """Loads a PDF, indexes it into FAISS, and answers questions via a RAG graph."""

    def __init__(self):
        # The Gemini chat model needs an API key in the environment; fail fast.
        if not os.environ.get("GOOGLE_API_KEY"):
            raise ValueError("Google Gemini API key not found in environment variables")

        self.llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"}
        )
        # Community RAG prompt: question + retrieved context -> answer.
        self.prompt = hub.pull("rlm/rag-prompt")
        self.vector_store = None  # set once a PDF has been processed
        self.chunk_size = 1000
        self.chunk_overlap = 200

    def process_pdf(self, file_path: str) -> Dict[str, Any]:
        """
        Load *file_path*, split it into overlapping chunks, and build a
        fresh FAISS index over them.

        Args:
            file_path (str): Path to the PDF file

        Returns:
            Dict[str, Any]: "status" plus page/chunk counts on success, or an
            error message on failure.
        """
        try:
            print(f"Processing PDF file: {file_path}")
            pages = PyPDFLoader(file_path).load()

            splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap,
            )
            chunks = splitter.split_documents(pages)

            # Size the FAISS index from one example embedding.
            dim = len(self.embedding_model.embed_query("test"))
            self.vector_store = FAISS(
                embedding_function=self.embedding_model,
                index=faiss.IndexFlatL2(dim),
                docstore=InMemoryDocstore(),
                index_to_docstore_id={},
            )
            self.vector_store.add_documents(documents=chunks)

            return {
                "status": "success",
                "message": "PDF processed successfully",
                "num_pages": len(pages),
                "num_chunks": len(chunks),
            }
        except Exception as e:
            return {
                "status": "error",
                "message": f"Error processing PDF: {str(e)}"
            }

    def query_response(self, query: str) -> Dict[str, Any]:
        """
        Answer *query* against the most recently processed PDF.

        Args:
            query (str): The question to ask about the document

        Returns:
            Dict[str, Any]: {"status": "success", "answer": ..., "query": ...}
            or an error dict when no document is loaded or the graph fails.
        """
        if not self.vector_store:
            return {
                "status": "error",
                "message": "No document has been processed yet"
            }

        try:
            def retrieve(state: State):
                # Fetch the chunks most similar to the question.
                hits = self.vector_store.similarity_search(state.question)
                return {"context": hits}

            def generate(state: State):
                # Concatenate retrieved chunks and let the LLM answer.
                joined = "\n\n".join(doc.page_content for doc in state.context)
                messages = self.prompt.invoke({
                    "question": state.question,
                    "context": joined
                })
                return {"answer": self.llm.invoke(messages).content}

            # Two-step pipeline: retrieve -> generate.
            workflow = (
                StateGraph(State)
                .add_sequence([retrieve, generate])
                .set_entry_point("retrieve")
                .compile()
            )

            result = workflow.invoke({"question": query})

            return {
                "status": "success",
                "answer": result["answer"],
                "query": query
            }
        except Exception as e:
            return {
                "status": "error",
                "message": f"Error querying document: {str(e)}"
            }
141
+
requirements.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core LangChain dependencies
2
+ langchain
3
+ langchain-community
4
+ langchain-core
5
+ langchain-text-splitters
6
+ langchain-google-genai
7
+ langgraph
8
+ grandalf
9
+
10
+ # FastAPI and web framework
11
+ fastapi
12
+ uvicorn[standard]
13
+ python-multipart
14
+ pydantic
15
+
16
+ # Document processing
17
+ PyPDF2
18
+ python-docx
19
+ docx2txt
20
+ unstructured
21
+ beautifulsoup4
22
+ requests
23
+
24
+ # Embeddings and vector stores - Fixed versions
25
+ sentence-transformers
26
+ faiss-cpu
27
+ numpy
28
+ torch
29
+ transformers
30
+ huggingface-hub
31
+
32
+ # Additional utilities
33
+ python-magic
34
+ python-magic-bin
35
+ pypdf
36
+ lxml
37
+
38
+
39
+
40
+ # Web scraping
41
+ requests
42
+ beautifulsoup4
task_manager.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pdfHandler import PDFProcessor
2
+ from docHandler import DocProcessor
3
+ from txtHandler import TextProcessor
4
+ from webHandler import WebProcessor
5
+ from typing import Dict, Any, List
6
+
7
class DocumentManager:
    """Routes uploads to the right processor and fans queries out across them."""

    def __init__(self):
        self.pdf_processor = PDFProcessor()
        self.doc_processor = DocProcessor()
        self.txt_processor = TextProcessor()
        self.web_processor = WebProcessor()

        # Each entry: {"processor", "file_path", "content_type", "filename"}
        self.processed_documents = []
        # Best-effort concatenation of everything processed so far.
        self.all_content = ""

    def process_document(self, file_path: str, content_type: str) -> Dict[str, Any]:
        """Dispatch *file_path* to the processor matching *content_type*.

        On success the document joins the collection; the processor's result
        dict is returned either way.
        """
        try:
            result = {"status": "error", "message": "Unknown file type"}
            processor = None

            print(f"Processing file: {file_path} with content type: {content_type}")

            if content_type == "application/pdf":
                result = self.pdf_processor.process_pdf(file_path)
                processor = self.pdf_processor
            elif content_type in (
                "application/msword",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ):
                result = self.doc_processor.process_docx(file_path)
                processor = self.doc_processor
            elif content_type == "text/plain":
                result = self.txt_processor.process_text(file_path)
                processor = self.txt_processor

            if result["status"] == "success" and processor:
                doc_info = {
                    "processor": processor,
                    "file_path": file_path,
                    "content_type": content_type,
                    # FIX: strip both / and \ path prefixes here, instead of
                    # splitting on '/' now and '\\' again at query time.
                    "filename": file_path.replace("\\", "/").split("/")[-1],
                }
                self.processed_documents.append(doc_info)

                # Best-effort: accumulate raw text when the processor exposes it.
                try:
                    if hasattr(processor, 'get_content'):
                        content = processor.get_content()
                        self.all_content += f"\n\n--- Document: {doc_info['filename']} ---\n{content}"
                except Exception:
                    # FIX: was a bare `except:` which also swallowed
                    # KeyboardInterrupt/SystemExit. Aggregation stays optional.
                    pass

                print(f"Document added to collection. Total documents: {len(self.processed_documents)}")

            return result
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def query_document(self, query: str) -> Dict[str, Any]:
        """Ask *query* of every processed document and combine the answers."""
        if not self.processed_documents:
            return {"status": "error", "message": "No documents processed"}

        print(f"Querying {len(self.processed_documents)} documents with question: {query}")

        try:
            answers = []

            for doc_info in self.processed_documents:
                filename = doc_info["filename"]
                try:
                    response = doc_info["processor"].query_response(query)
                except Exception as e:
                    # FIX: name the failing document instead of "(unknown)".
                    print(f"Error querying {filename}: {e}")
                    continue

                if response.get("status") == "success":
                    answer = response.get("answer", "")
                    if answer and answer.strip():
                        answers.append(f"From {filename}:\n {answer}")

            if not answers:
                return {"status": "error", "message": "No relevant information found in any documents"}

            return {
                "status": "success",
                "answer": "\n\n".join(answers)
            }

        except Exception as e:
            # Fallback: answer from the most recently processed document only.
            print(f"Multi-document query failed, using last document: {e}")
            return self.processed_documents[-1]["processor"].query_response(query)

    def clear_documents(self):
        """Forget every processed document and reset the combined content."""
        self.processed_documents = []
        self.all_content = ""
        print("All documents cleared - ready for new uploads")

    def process_url(self, url: str) -> Dict[str, Any]:
        """Process a web page URL and add it to the document collection."""
        try:
            result = self.web_processor.process_url(url)
            if result["status"] == "success":
                doc_info = {
                    "processor": self.web_processor,
                    "file_path": url,
                    "content_type": "text/html",
                    "filename": f"webpage_{url.split('/')[-1] or 'index'}",
                }
                self.processed_documents.append(doc_info)

                try:
                    if hasattr(self.web_processor, 'get_content'):
                        content = self.web_processor.get_content()
                        self.all_content += f"\n\n--- Web Page: {url} ---\n{content}"
                except Exception:
                    # Optional aggregation; never fail URL processing over it.
                    pass

                print(f"URL processed and added to collection: {url}")
            return result
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def get_status(self) -> Dict[str, Any]:
        """Summarize the current collection for the /status endpoint."""
        return {
            "total_documents": len(self.processed_documents),
            "document_types": list({doc["content_type"] for doc in self.processed_documents}),
            "filenames": [doc["filename"] for doc in self.processed_documents],
        }
+ }
txtHandler.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import TextLoader
2
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
3
+ from langchain.chat_models import init_chat_model
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ import faiss
6
+ from langchain_community.docstore.in_memory import InMemoryDocstore
7
+ from langchain_community.vectorstores import FAISS
8
+ import os
9
+ from langchain import hub
10
+ from dotenv import load_dotenv
11
+ from langgraph.graph import START, StateGraph
12
+ from typing import List, Dict, Any, Optional
13
+ from pydantic import BaseModel, Field
14
+ from langchain.docstore.document import Document
15
+
16
+ load_dotenv()
17
+
18
class State(BaseModel):
    """Shared state flowing through the LangGraph RAG pipeline.

    The graph's retrieve step fills ``context`` and the generate step fills
    ``answer``; callers only need to supply ``question``.

    NOTE(review): the ``Field`` description strings are part of the pydantic
    schema (runtime data), so they are deliberately left unchanged.
    """
    # The user's question; the only required input to the graph.
    question: str = Field(..., description="Type your question here")
    # Documents retrieved from the vector store for the question.
    context: List[Document] = Field(
        default_factory=list,
        description="A list of Document objects",
    )
    # Generated answer, produced by the LLM generation step.
    answer: str = Field(default="", description="Answer will be here")
25
+
26
class TextProcessor:
    """RAG pipeline over a single plain-text file.

    Loads a text file, splits it into overlapping chunks, indexes the chunks
    in an in-memory FAISS store, and answers questions with a two-step
    retrieve-then-generate LangGraph.
    """

    def __init__(self):
        # Fail fast when the LLM credential is missing.
        if not os.environ.get("GOOGLE_API_KEY"):
            raise ValueError("Google Gemini API key not found in environment variables")

        self.llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"},
        )
        self.prompt = hub.pull("rlm/rag-prompt")
        # Populated by process_text(); queries fail until then.
        self.vector_store = None
        self.chunk_size = 1000
        self.chunk_overlap = 200

    def process_text(self, file_path: str) -> Dict[str, Any]:
        """Process a text file and prepare it for querying.

        Args:
            file_path (str): Path to the text file.

        Returns:
            Dict[str, Any]: Processing status and information.
        """
        try:
            # Load the file, then split it into overlapping retrieval chunks.
            documents = TextLoader(file_path, encoding='utf-8').load()
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap,
            )
            chunks = splitter.split_documents(documents)

            # Build a fresh FAISS index sized to the embedding dimensionality.
            dim = len(self.embedding_model.embed_query("test"))
            self.vector_store = FAISS(
                embedding_function=self.embedding_model,
                index=faiss.IndexFlatL2(dim),
                docstore=InMemoryDocstore(),
                index_to_docstore_id={},
            )
            self.vector_store.add_documents(documents=chunks)

            return {
                "status": "success",
                "message": "Text file processed successfully",
                "num_pages": len(documents),
                "num_chunks": len(chunks),
            }
        except Exception as e:
            return {
                "status": "error",
                "message": f"Error processing text file: {str(e)}"
            }

    def query_response(self, query: str) -> Dict[str, Any]:
        """Query the processed document.

        Args:
            query (str): The question to ask about the document.

        Returns:
            Dict[str, Any]: Answer and status (or an error message when no
            document has been processed or the pipeline fails).
        """
        if not self.vector_store:
            return {
                "status": "error",
                "message": "No document has been processed yet"
            }

        try:
            def retrieve(state: State):
                # Pull the chunks most similar to the question.
                docs = self.vector_store.similarity_search(state.question)
                return {"context": docs}

            def generate(state: State):
                # Stuff the retrieved chunks into the RAG prompt and ask the LLM.
                joined = "\n\n".join(d.page_content for d in state.context)
                messages = self.prompt.invoke({
                    "question": state.question,
                    "context": joined
                })
                return {"answer": self.llm.invoke(messages).content}

            # Wire retrieve -> generate and compile the runnable graph.
            builder = StateGraph(State)
            builder.add_sequence([retrieve, generate])
            graph = builder.set_entry_point("retrieve").compile()

            result = graph.invoke({"question": query})

            return {
                "status": "success",
                "answer": result["answer"],
                "query": query,
            }
        except Exception as e:
            return {
                "status": "error",
                "message": f"Error querying document: {str(e)}"
            }
140
+ }
141
+
webHandler.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from typing import Dict, Any
4
+
5
class WebProcessor:
    """Fetches a web page, strips boilerplate markup, and answers simple
    keyword queries against the extracted text."""

    def __init__(self):
        # Extracted page text and its source URL; empty until process_url().
        self.content = ""
        self.url = ""

    def process_url(self, url: str) -> Dict[str, Any]:
        """Process a web page URL"""
        try:
            # Browser-like headers so naive bot filters still serve the page.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
            }

            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Drop non-content elements before extracting visible text.
            for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'advertisement']):
                tag.decompose()

            raw_text = soup.get_text()

            # Normalize whitespace: strip each line, break lines into
            # fragments, and rejoin the non-empty pieces with single spaces.
            stripped_lines = (ln.strip() for ln in raw_text.splitlines())
            fragments = (piece.strip() for ln in stripped_lines for piece in ln.split(" "))
            cleaned = ' '.join(frag for frag in fragments if frag)

            if not cleaned.strip():
                return {"status": "error", "message": "No text content could be extracted from the webpage"}

            title_tag = soup.find('title')
            page_title = title_tag.get_text().strip() if title_tag else "Untitled"

            self.url = url
            self.content = cleaned.strip()

            word_total = len(cleaned.split())
            return {
                "status": "success",
                "message": "Web page processed successfully",
                "title": page_title,
                "num_pages": 1,
                # Rough chunk estimate: ~100 words per chunk.
                "num_chunks": word_total // 100 + 1,
                "word_count": word_total,
            }

        except requests.exceptions.RequestException as e:
            return {"status": "error", "message": f"Failed to fetch webpage: {str(e)}"}
        except Exception as e:
            return {"status": "error", "message": f"Error processing webpage: {str(e)}"}

    def query_response(self, query: str) -> Dict[str, Any]:
        """Answer a query about the web content"""
        if not self.content:
            return {"status": "error", "message": "No web content available"}

        try:
            # Delegate to the simple keyword matcher.
            return {
                "status": "success",
                "answer": self._search_content(query, self.content),
            }
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def get_content(self) -> str:
        """Get the extracted content"""
        return self.content

    def _search_content(self, query: str, content: str) -> str:
        """Simple keyword-based search"""
        terms = query.lower().split()

        # Keep only sentences long enough to carry information.
        candidates = [
            fragment.strip()
            for fragment in content.split('.')
            if len(fragment.strip()) > 10
        ]

        # Score each sentence by how many query terms it contains.
        scored = []
        for sentence in candidates:
            lowered = sentence.lower()
            hits = sum(1 for term in terms if term in lowered)
            if hits:
                scored.append((sentence, hits))

        if not scored:
            return "I couldn't find information related to your query on this webpage."

        # Highest score first; the stable sort keeps document order on ties.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return ". ".join(pair[0] for pair in scored[:3])