Test / app.py
ahm14's picture
Update app.py
3cf4930 verified
import streamlit as st
from docx import Document
from transformers import pipeline
from langdetect import detect
import spacy
import os
# Name of the spaCy pipeline package used for entity extraction.
model_name = "en_core_web_sm"


@st.cache_resource
def _load_spacy_model(name):
    """Return the spaCy pipeline *name*, installing its package first if absent.

    Cached as a Streamlit resource so the pipeline is built once per server
    process rather than on every script rerun.
    """
    if not spacy.util.is_package(name):
        # Use spaCy's own downloader rather than shelling out to "python":
        # the bare command resolves through PATH and may not be the
        # interpreter that is running this app.
        spacy.cli.download(name)
    return spacy.load(name)


@st.cache_resource
def _load_pipeline(task, model_id):
    """Return a transformers pipeline for *task* backed by *model_id*.

    Cached as a Streamlit resource: Streamlit re-executes this script on each
    widget interaction, and rebuilding multi-gigabyte models every time would
    make the app unusable.
    """
    return pipeline(task, model=model_id)


nlp = _load_spacy_model(model_name)

# Summarization pipeline backed by Llama 3.
# NOTE(review): meta-llama/Meta-Llama-3-8B is published as a causal language
# model; confirm the "summarization" task accepts it, or switch to a
# sequence-to-sequence summarization checkpoint.
llama_summarizer = _load_pipeline("summarization", "meta-llama/Meta-Llama-3-8B")

# Text-generation pipeline backed by Gemma (the 2B instruction-tuned
# checkpoint is what is actually loaded here).
gemma_recommender = _load_pipeline("text-generation", "google/gemma-2b-it")
# Function to extract text from a DOCX file
def extract_text_from_docx(docx_file):
    """Return the text of a DOCX file, one non-blank paragraph per line.

    Args:
        docx_file: Anything ``docx.Document`` accepts (a path or a file-like
            object such as a Streamlit upload).

    Returns:
        The paragraph texts from ``Document.paragraphs`` joined with newlines.
        Paragraphs that are empty or whitespace-only are skipped.
    """
    kept_lines = []
    for paragraph in Document(docx_file).paragraphs:
        content = paragraph.text
        if content.strip():
            kept_lines.append(content)
    return "\n".join(kept_lines)
# Function to detect document language
def detect_language(text):
    """Return the language code detected for *text*, or ``"unknown"``.

    Args:
        text: The document text to classify.

    Returns:
        The code returned by ``langdetect.detect``. ``"unknown"`` is returned
        when *text* is empty/whitespace-only or when langdetect cannot find
        any usable features in it (for example text made only of digits),
        instead of letting the exception crash the app.
    """
    if not text or not text.strip():
        return "unknown"

    # Imported here so the block is self-contained; the file's top-level
    # import only brings in ``detect``.
    from langdetect import LangDetectException

    try:
        return detect(text)
    except LangDetectException:
        return "unknown"
# Function to extract metadata
def extract_metadata(text):
    """Return the word count and named entities found in *text*.

    Args:
        text: The document text to analyse.

    Returns:
        A dict with two keys:

        * ``"Word Count"``: number of whitespace-separated tokens in *text*.
        * ``"Entities"``: dict mapping each entity label to the list of
          distinct entity texts carrying that label, in order of first
          appearance.
    """
    # Nothing to analyse: skip running the NLP pipeline entirely.
    if not text or not text.strip():
        return {"Word Count": 0, "Entities": {}}

    doc = nlp(text)

    # Group by label. Keying a dict directly on the label would keep only the
    # last entity seen for each label and silently drop all the others.
    entities = {}
    for ent in doc.ents:
        mentions = entities.setdefault(ent.label_, [])
        if ent.text not in mentions:
            mentions.append(ent.text)

    return {
        "Word Count": len(text.split()),
        "Entities": entities,
    }
# Function to generate abstract (summary) using Llama 3
def generate_summary(text):
    """Return an abstract of *text* produced by the summarization pipeline.

    Args:
        text: The document text to summarise.

    Returns:
        The ``summary_text`` of the first pipeline result, or an empty string
        when *text* is empty or whitespace-only.
    """
    if not text or not text.strip():
        return ""

    summary = llama_summarizer(
        text,
        max_length=200,
        min_length=50,
        do_sample=False,
        # Uploaded documents can be longer than the model's input limit;
        # truncate rather than fail on long inputs.
        truncation=True,
    )
    return summary[0]["summary_text"]
# Function to generate recommendations using Gemma (google/gemma-2b-it)
def generate_recommendations(text):
    """Return model-generated recommendations for the document *text*.

    Args:
        text: The document text the recommendations should be based on.

    Returns:
        The text generated after the prompt, stripped of surrounding
        whitespace, or an empty string when *text* is empty or
        whitespace-only.
    """
    if not text or not text.strip():
        return ""

    prompt = f"Provide three key recommendations based on the following document:\n{text}\n\nRecommendations:"
    recommendations = gemma_recommender(
        prompt,
        # Budget for the generated tokens only. ``max_length`` also counts the
        # prompt, which embeds the whole document and would leave little or
        # no room for an answer.
        max_new_tokens=300,
        num_return_sequences=1,
        do_sample=False,
        # Return just the continuation instead of echoing the prompt (and
        # therefore the entire document) back to the user.
        return_full_text=False,
    )
    return recommendations[0]["generated_text"].strip()
# Streamlit UI: upload a DOCX file, show its text, language and metadata, and
# generate a summary plus recommendations on demand.
# The emoji in the labels below were restored from mis-decoded UTF-8 bytes.
st.title("📄 AI-Powered Multi-Language Document Analyzer")
uploaded_file = st.file_uploader("Upload a Word Document", type=["docx"])

if uploaded_file:
    st.success("File uploaded successfully!")

    # Extract text
    doc_text = extract_text_from_docx(uploaded_file)

    # A document with no text gives language detection and the models
    # nothing to work with, so stop this run with a clear message.
    if not doc_text.strip():
        st.warning("No readable text was found in the uploaded document.")
        st.stop()

    # Detect language
    language = detect_language(doc_text)

    # Extract metadata
    metadata = extract_metadata(doc_text)

    st.subheader("Extracted Text:")
    st.text_area("Document Content", doc_text, height=250)

    st.subheader("🗣️ Detected Language:")
    st.write(language)

    st.subheader("📊 Metadata:")
    st.json(metadata)

    if st.button("Generate Abstract & Recommendations"):
        with st.spinner("Analyzing..."):
            summary = generate_summary(doc_text)
            recommendations = generate_recommendations(doc_text)

        st.subheader("📌 Abstract (Summary) - Llama 3")
        st.write(summary)

        # NOTE(review): this heading says "Gemma 2-9B-IT", but the model id
        # loaded earlier in this file is "google/gemma-2b-it"; confirm which
        # one is intended.
        st.subheader("✅ Recommendations - Gemma 2-9B-IT")
        st.write(recommendations)