import bs4
import shutil
import json
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.globals import set_verbose
import os
import glob
from typing import List
from multiprocessing import Pool
from tqdm import tqdm
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import AIMessage, HumanMessage
from flask import Flask, request, jsonify
from langchain_community.document_loaders import (
    CSVLoader,
    EverNoteLoader,
    PyMuPDFLoader,
    TextLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.docstore.document import Document
from langchain_community.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts.prompt import PromptTemplate

app = Flask(__name__)

# SECURITY: a literal OpenAI API key used to be written into
# os.environ["OPENAI_API_KEY"] at this point. Credentials must never be
# committed to source control; treat that key as leaked and revoke it.
# The key is now expected to be supplied by the process environment
# (export OPENAI_API_KEY before starting the server). OpenAI clients that
# are not handed an explicit key read it from that variable.
if not os.environ.get("OPENAI_API_KEY"):
    print(
        "Warning: OPENAI_API_KEY is not set; OpenAI calls that are not "
        "given an explicit api_key will fail."
    )

# Optional settings read from the environment (None when unset).
persist_directory = os.environ.get('PERSIST_DIRECTORY')
embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')

# Settings handed to RecursiveCharacterTextSplitter in process_documents().
chunk_size = 1000
chunk_overlap = 200

# Registry of supported file types: maps a lower-case file extension
# (including the leading dot) to a ``(loader class, constructor kwargs)``
# pair. Entries are listed as (extension, loader class, kwargs) rows; the
# resulting dict keeps the row order.
LOADER_MAPPING = {
    extension: (loader_cls, loader_kwargs)
    for extension, loader_cls, loader_kwargs in (
        (".csv", CSVLoader, {}),
        (".doc", UnstructuredWordDocumentLoader, {}),
        (".docx", UnstructuredWordDocumentLoader, {}),
        (".enex", EverNoteLoader, {}),
        (".epub", UnstructuredEPubLoader, {}),
        (".html", UnstructuredHTMLLoader, {}),
        (".md", UnstructuredMarkdownLoader, {}),
        (".odt", UnstructuredODTLoader, {}),
        (".pdf", PyPDFLoader, {}),
        (".ppt", UnstructuredPowerPointLoader, {}),
        (".pptx", UnstructuredPowerPointLoader, {}),
        (".txt", TextLoader, {"encoding": "utf8"}),
    )
}

def load_single_document(file_path: str) -> List[Document]:
    """Load one file with the loader registered for its extension.

    Args:
        file_path: Path of the file to load.

    Returns:
        The documents produced by the loader's ``load()`` method.

    Raises:
        ValueError: If the file's extension has no entry in LOADER_MAPPING.
    """
    # LOADER_MAPPING keys are lower-case, so normalise the extension first.
    ext = "." + file_path.rsplit(".", 1)[-1].lower()
    if ext not in LOADER_MAPPING:
        raise ValueError(f"Unsupported file extension '{ext}'")

    loader_class, loader_args = LOADER_MAPPING[ext]
    loader = loader_class(file_path, **loader_args)
    # LangChain document loaders expose load(), which returns ready-made
    # Document objects. The previous get_images()/get_texts() calls are not
    # part of the loader API and raised AttributeError for every file.
    return loader.load()

def load_documents(source_dir: str, ignored_files: "List[str] | None" = None) -> List[Document]:
    """Load every supported file found under ``source_dir``.

    The directory is searched recursively for each extension registered in
    LOADER_MAPPING, once with the lower-case and once with the upper-case
    spelling of the extension. The matching files are loaded in parallel by
    a process pool running ``load_single_document``.

    Args:
        source_dir: Directory to search.
        ignored_files: Paths (as returned by ``glob``) to skip. ``None``
            means nothing is skipped.

    Returns:
        All documents from all loaded files, in completion order (the pool
        yields results unordered). Empty when no file matched.
    """
    # A set gives O(1) membership tests when filtering below.
    ignored = set(ignored_files or ())

    matched = []
    for ext in LOADER_MAPPING:
        for spelling in (ext.lower(), ext.upper()):
            pattern = os.path.join(source_dir, f"**/*{spelling}")
            matched.extend(glob.glob(pattern, recursive=True))

    # Where glob matching is case-insensitive the two spellings return the
    # same path twice; dict.fromkeys drops duplicates and keeps the order.
    filtered_files = [path for path in dict.fromkeys(matched) if path not in ignored]
    if not filtered_files:
        # Nothing to do: avoid spawning worker processes for no work.
        return []

    results = []
    with Pool(processes=os.cpu_count()) as pool:
        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
            for docs in pool.imap_unordered(load_single_document, filtered_files):
                results.extend(docs)
                pbar.update()

    return results

def process_documents(source_directory, ignored_files: "List[str] | None" = None) -> List[Document]:
    """Load the documents under ``source_directory`` and split them into chunks.

    Args:
        source_directory: Directory passed to ``load_documents``.
        ignored_files: Paths to skip while loading. ``None`` means nothing
            is skipped.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` using the
        module-level ``chunk_size`` and ``chunk_overlap``.

    Raises:
        ValueError: If no documents were loaded.
    """
    print(f"Loading documents from {source_directory}")
    # Always hand load_documents a real list so this function does not
    # depend on how load_documents treats a missing ignore list.
    documents = load_documents(source_directory, list(ignored_files or []))
    if not documents:
        # This runs inside a web request. exit(0) would raise SystemExit,
        # which ``except Exception`` handlers do not catch, so report the
        # condition as an ordinary exception instead.
        raise ValueError("No new documents to load")
    print(f"Loaded {len(documents)} new documents from {source_directory}")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.split_documents(documents)
    # No length function is passed to the splitter, so chunk_size counts
    # characters, not tokens.
    print(f"Split into {len(documents)} chunks of text (max. {chunk_size} characters each)")
    return documents

def _resolve_store_dir(name):
    """Return the ``chromadb/<name>`` path after checking it stays inside ``chromadb/``.

    Raises:
        ValueError: If ``name`` is not a non-empty string, or if the
            resulting path is the ``chromadb`` root itself or lies outside it
            (for example a name containing ``..``).
    """
    if not isinstance(name, str) or not name:
        raise ValueError("A non-empty embeddings folder name is required")
    store_dir = "chromadb/" + name
    root = os.path.abspath("chromadb")
    target = os.path.abspath(store_dir)
    if target == root or os.path.commonpath([root, target]) != root:
        raise ValueError(f"Invalid embeddings folder name: {name!r}")
    return store_dir


@app.route('/save_embeddings', methods=['POST'])
def save_embeddings_to_db():
    """Build a Chroma store from the documents in a folder.

    Expects a JSON object with:
        folder: Name of the store to create under ``chromadb/``.
        api_key: OpenAI API key used for the embeddings.
        folder_path: Directory whose documents are loaded and embedded.
        delete_folder: Optional name of an existing store under
            ``chromadb/`` to remove first.

    Returns:
        JSON ``{'message': ..., 'status': 'success'}`` on success, or
        ``{'error': ..., 'status': 'error'}`` when anything fails.
    """
    try:
        data = request.get_json()
        if not isinstance(data, dict):
            raise ValueError("Request body must be a JSON object")
        missing = [key for key in ('folder', 'api_key', 'folder_path') if key not in data]
        if missing:
            raise ValueError(f"Missing required field(s): {', '.join(missing)}")

        folder_name = data['folder']
        api_key = data['api_key']
        folder_path = data['folder_path']
        delete_folder = data.get('delete_folder')

        # Both names come from the request: confine them to chromadb/ so a
        # value such as "../.." (or an empty string, which used to resolve to
        # the chromadb root) can never reach shutil.rmtree.
        target_dir = _resolve_store_dir(folder_name)
        stale_dir = _resolve_store_dir(delete_folder) if delete_folder else None

        # Load and split first so that a bad folder_path or an empty folder
        # fails before the existing store is removed.
        documents = process_documents(folder_path)

        if stale_dir and os.path.exists(stale_dir):
            shutil.rmtree(stale_dir)
            print("Directory '{}' deleted successfully.".format(stale_dir))

        embeddings = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=api_key)
        vectordb = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory=target_dir)
        vectordb.persist()
        return jsonify({'message': 'Embeddings processed and saved', 'status' : 'success'})
    except Exception as e:
        # Top-level request boundary: report the failure to the client.
        return jsonify({'error': str(e), 'status': 'error'})

@app.route('/get_responses', methods=['POST'])
def get_responses():
    """Answer a question using the documents stored in a Chroma folder.

    Expects a JSON object with:
        query: The user's question.
        api_key: OpenAI API key; when empty or absent the OpenAI clients
            are created without an explicit key.
        context: Iterable of ``{'question': ..., 'answer': ...}`` items
            forming the previous chat turns.
        instructions: Extra system-prompt lines, either as a JSON array or
            as a JSON-encoded string of an array.
        folder: Name of the store under ``chromadb/`` to query.

    Returns:
        ``str()`` of the model's reply message on success, or JSON
        ``{'error': ...}`` when anything fails.
    """
    try:
        set_verbose(True)
        data = request.get_json()
        query = data.get('query', '')
        api_key = data.get('api_key', '')
        context = data.get('context', '')
        question = query
        persist_dir = data.get('folder')

        # Accept the instructions either already decoded (a JSON array in the
        # request body) or, as before, as a JSON-encoded string.
        raw_instructions = data.get('instructions', '[]')
        if isinstance(raw_instructions, str):
            instructions = json.loads(raw_instructions)
        else:
            instructions = raw_instructions or []
        if isinstance(instructions, str):
            # A single instruction given as a bare string.
            instructions = [instructions]

        # The folder name comes from the request: require it and keep the
        # resulting path inside chromadb/.
        if not isinstance(persist_dir, str):
            raise ValueError("'folder' must be provided as a string")
        store_dir = "chromadb/" + persist_dir
        store_root = os.path.abspath("chromadb")
        if os.path.commonpath([store_root, os.path.abspath(store_dir)]) != store_root:
            raise ValueError(f"Invalid folder name: {persist_dir!r}")

        # Use the caller's key for BOTH clients; previously only the
        # embeddings received it and the chat model relied on a global key.
        key_kwargs = {"openai_api_key": api_key} if api_key else {}
        llm = ChatOpenAI(model="gpt-4-turbo", temperature=1.0, **key_kwargs)
        embeddings = OpenAIEmbeddings(model="text-embedding-3-large", **key_kwargs)
        vectorstore = Chroma(persist_directory=store_dir, embedding_function=embeddings)
        retriever = vectorstore.as_retriever()

        # NOTE(review): this prompt asks for a reformulated question but also
        # carries HTML answer-formatting rules; confirm that is intended.
        contextualize_q_system_prompt = """Given a chat history and the latest user question \
        which might reference context in the chat history, formulate a standalone question \
        which can be understood without the chat history. Do NOT answer the question, \
        just reformulate it if needed and otherwise return it as is.\
        Always provide answers in html format using tags. \
        If asked for points or steps or details of anything, give answer in html ol or ul tags. 
        """
        contextualize_q_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", contextualize_q_system_prompt),
                MessagesPlaceholder(variable_name="chat_history"),
                ("human", "{question}"),
            ]
        )
        contextualize_q_chain = contextualize_q_prompt | llm | StrOutputParser()

        # Caller-supplied instructions become part of a prompt *template*, so
        # literal braces must be doubled or they would be parsed as template
        # variables and make the chain fail with a missing-variable error.
        escaped_instructions = [
            str(instruction).replace("{", "{{").replace("}", "}}")
            for instruction in instructions
        ]
        prompt_lines = [
            "You are an assistant for question-answering tasks.",
            "Use the following pieces of retrieved context to answer the question.",
            "Use <br> as line seperator and eliminate '\\n'.",
            "If asked for points or steps or details of anything, give answer in html ol or ul tags.",
            *escaped_instructions,
            "{context}",
        ]
        # Every line, including the first, is preceded by a newline.
        qa_system_prompt = "".join(f"\n{line}" for line in prompt_lines)

        qa_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", qa_system_prompt),
                MessagesPlaceholder(variable_name="chat_history"),
                ("human", "{question}"),
            ]
        )

        def contextualized_question(payload: dict):
            # With prior turns, return the reformulation chain (a runnable);
            # otherwise pass the question through unchanged.
            if payload.get("chat_history"):
                return contextualize_q_chain
            return payload["question"]

        rag_chain = (
            RunnablePassthrough.assign(
                context=contextualized_question | retriever
            )
            | qa_prompt
            | llm
        )

        chat_history = []
        for item in context:
            chat_history.append(HumanMessage(content=item.get('question')))
            chat_history.append(AIMessage(content=item.get('answer')))

        ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})
        # NOTE(review): str() of the message object is returned rather than
        # its text content; kept as is because clients may depend on this
        # format - confirm whether ai_msg.content was intended.
        return str(ai_msg)

    except Exception as e:
        # Top-level request boundary: report the failure to the client.
        return jsonify({'error': str(e)})

if __name__ == '__main__':
    # Debug mode enables Werkzeug's interactive debugger, which lets anyone
    # who can reach the server execute arbitrary code. It is therefore opt-in
    # through the FLASK_DEBUG environment variable instead of hard-coded on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled)
