Deeksha committed on
Commit
fa9d3ad
·
0 Parent(s):

Initial clean commit for Hugging Face deployment

Browse files
Files changed (5) hide show
  1. .DS_Store +0 -0
  2. .gitignore +5 -0
  3. Dockerfile +23 -0
  4. requirements.txt +10 -0
  5. streamlit_app.py +156 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ venv/
4
+ .env
5
+ faiss_index/
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use official lightweight Python image
FROM python:3.10-slim

# Disable usage-stats collection and run headless on the Hugging Face Spaces port.
ENV STREAMLIT_BROWSER_GATHERUSAGESTATS=false
ENV STREAMLIT_DISABLE_WATCHDOG_WARNINGS=true
ENV STREAMLIT_SERVER_HEADLESS=true
ENV STREAMLIT_SERVER_PORT=7860
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
# HOME=/tmp lets Streamlit write its config/cache in the unprivileged container.
ENV HOME=/tmp

# Set working directory
WORKDIR /app

# Copy requirements first so dependency layers are cached across code changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the code
COPY . .

# Run the app. The entry point committed in this repo is streamlit_app.py;
# the previous CMD referenced chatpdf1.py, which does not exist in this commit.
CMD ["streamlit", "run", "streamlit_app.py"]
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ google-generativeai
3
+ python-dotenv
4
+ langchain
5
+ langchain-community
6
+ langchain-google-genai
7
+ faiss-cpu
8
+ PyPDF2
9
+ python-docx
10
+ beautifulsoup4
streamlit_app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from PyPDF2 import PdfReader
3
+ from docx import Document
4
+ from bs4 import BeautifulSoup
5
+ import os
6
+ import google.generativeai as genai
7
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
8
+ from langchain_community.vectorstores import FAISS
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain.chains.question_answering import load_qa_chain
11
+ from langchain.prompts import PromptTemplate
12
+ from dotenv import load_dotenv
13
+
14
# ========================
# 1️⃣ Configuration
# ========================
# Pull GOOGLE_API_KEY from a .env file (if present) and wire up the Gemini SDK.
# st.stop() halts the script run, so genai is only configured with a real key.
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
if api_key:
    genai.configure(api_key=api_key)
else:
    st.error("GOOGLE_API_KEY not found. Please add it to your .env file.")
    st.stop()
25
+
26
# ========================
# 2️⃣ File Size Limits
# ========================
MAX_TOTAL_SIZE_MB = 5
MAX_FILE_SIZE_MB = 2

def validate_file_sizes(uploaded_files):
    """Return True when every file is under the per-file cap and the batch
    is under the total cap; otherwise show a Streamlit warning and return False."""
    running_total_mb = 0.0
    for upload in uploaded_files:
        mb = upload.size / (1024 * 1024)
        if mb > MAX_FILE_SIZE_MB:
            st.warning(f"{upload.name} is too large ({mb:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
            return False
        running_total_mb += mb

    if running_total_mb > MAX_TOTAL_SIZE_MB:
        st.warning(f"Total size of uploaded files is {running_total_mb:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB in total.")
        return False

    return True
46
+
47
+ # ========================
48
+ # 3️⃣ Text Extraction Functions
49
+ # ========================
50
def get_pdf_text(pdf_docs):
    """Concatenate the extractable text of every page of every uploaded PDF.

    Pages with no extractable text (extract_text() returns None) are skipped.
    """
    pieces = []
    for pdf in pdf_docs:
        for page in PdfReader(pdf).pages:
            extracted = page.extract_text()
            if extracted:
                pieces.append(extracted)
    return "".join(pieces)
59
+
60
def get_docx_text(docx_file):
    """Return the text of every paragraph in a .docx file, newline-joined."""
    document = Document(docx_file)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
63
+
64
def get_html_text(html_file):
    """Strip markup from an uploaded HTML file and return its visible text."""
    markup = html_file.read()
    return BeautifulSoup(markup, "html.parser").get_text()
68
+
69
+ # ========================
70
+ # 4️⃣ Text Chunking and Vector Store
71
+ # ========================
72
def get_text_chunks(text):
    """Split raw text into 2000-char chunks with 200-char overlap for embedding."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    return splitter.split_text(text)
75
+
76
def get_vector_store(text_chunks):
    """Embed the chunks with Gemini embeddings and persist a FAISS index
    to the local "faiss_index" directory (read back later by user_input)."""
    store = FAISS.from_texts(
        text_chunks,
        embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
    )
    store.save_local("faiss_index")
80
+
81
+ # ========================
82
+ # 5️⃣ Conversational Chain Setup
83
+ # ========================
84
def get_conversational_chain():
    """Build a "stuff"-type QA chain over Gemini 1.5 Flash with a grounded prompt."""
    prompt_template = """
    Answer the question as detailed as possible from the provided context. If the answer is not available, say "answer is not available in the context."

    Context:
    {context}

    Question:
    {question}

    Answer:
    """
    qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.3)
    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)
100
+
101
def user_input(user_question):
    """Answer *user_question* against the locally saved FAISS index.

    Loads the index written by get_vector_store(), retrieves the most similar
    chunks, runs them through the QA chain, and writes the answer to the page.
    """
    # Guard: previously this crashed with a raw traceback if the user asked a
    # question before any files had been processed (no faiss_index on disk).
    if not os.path.isdir("faiss_index"):
        st.warning("No processed documents found. Please upload and process files first.")
        return

    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    # allow_dangerous_deserialization is required for pickle-backed FAISS stores;
    # acceptable here because the index is produced by this same app, not user input.
    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    docs = new_db.similarity_search(user_question)

    chain = get_conversational_chain()
    response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
    st.write("Reply:", response["output_text"])
111
+
112
+ # ========================
113
+ # 6️⃣ Streamlit App Layout
114
+ # ========================
115
def main():
    """Streamlit entry point: upload documents, build the index, answer questions."""
    st.set_page_config(page_title="Chat with Documents")
    st.header("Chat with your PDF, DOCX, or HTML using Gemini 💬")

    user_question = st.text_input("Ask a question about your uploaded files:")

    if user_question:
        user_input(user_question)

    with st.sidebar:
        st.title("Upload & Process Files")
        uploaded_files = st.file_uploader(
            "Upload PDF, DOCX, or HTML files",
            accept_multiple_files=True,
            type=['pdf', 'docx', 'html'],
        )

        if st.button("Submit & Process"):
            if not uploaded_files:
                st.warning("Please upload at least one file.")
                return

            if not validate_file_sizes(uploaded_files):
                return

            with st.spinner("Processing files..."):
                full_text = ""
                for file in uploaded_files:
                    if file.name.endswith(".pdf"):
                        full_text += get_pdf_text([file])
                    elif file.name.endswith(".docx"):
                        full_text += get_docx_text(file)
                    elif file.name.endswith(".html"):
                        full_text += get_html_text(file)
                    else:
                        # file_uploader restricts types, but keep a defensive branch.
                        st.warning(f"Unsupported file type: {file.name}")

                # Guard: FAISS.from_texts raises on an empty chunk list (e.g. a
                # scanned PDF with no extractable text) — fail with a clear message.
                if not full_text.strip():
                    st.warning("No text could be extracted from the uploaded files.")
                    return

                text_chunks = get_text_chunks(full_text)
                get_vector_store(text_chunks)
                st.success("Processing complete!")

if __name__ == "__main__":
    main()