// Copyright (c) 2023 Horizon Robotics.All Rights Reserved.
//
// The material in this file is confidential and contains trade secrets
// of Horizon Robotics Inc. This is proprietary information owned by
// Horizon Robotics Inc. No part of this work may be disclosed,
// reproduced, copied, transmitted, or used in any way for any purpose,
// without the express written permission of Horizon Robotics Inc.

// This is a simple program that describes how to run resnet50 classification
// on an image and get its top k results by predict score.
// Should be noted: Only resnet50 is supported here.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <map>
#include <queue>
#include <random>
#include <string>
#include <utility>
#include <vector>

#include "gflags/gflags.h"
#include "hlog/logging.h"
#include "hobot/dnn/hb_dnn.h"
#include "hobot/hb_ucp.h"
#include "hobot/hb_ucp_sys.h"
#include "opencv2/core/mat.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
// Default value (empty string) for the string flags below.
#define EMPTY ""

// Command-line flags (gflags).
// model_file: path handed to hbDNNInitializeFromFiles in main().
DEFINE_string(model_file, EMPTY, "model file path");
// image_file: passed to read_image_2_tensor_as_nv12(), which appends
// "/<camera sub-path>" to it, i.e. it is used as a base directory.
DEFINE_string(image_file, EMPTY, "Test image path");
// top_k: number of highest-scoring entries requested from get_topk_result().
DEFINE_int32(top_k, 5, "Top k classes, 5 by default");

// Module tag handed to the hlog macros below. The identifier is spelled
// "MOUDULE" (sic); main() passes the same string literal to SetLogLevel.
#define MOUDULE_NAME "DNN_BASIC_SAMPLE"
// Thin wrappers that forward a format string plus arguments to the
// HFLOGM_* macros with the module tag filled in.
// NOTE(review): the D/I/E/W suffixes presumably stand for the debug, info,
// error and warning levels of hlog - confirm against hlog/logging.h.
#define LOGD(err_msg, ...) HFLOGM_D(MOUDULE_NAME, err_msg, ##__VA_ARGS__)
#define LOGI(err_msg, ...) HFLOGM_I(MOUDULE_NAME, err_msg, ##__VA_ARGS__)
#define LOGE(err_msg, ...) HFLOGM_E(MOUDULE_NAME, err_msg, ##__VA_ARGS__)
#define LOGW(err_msg, ...) HFLOGM_W(MOUDULE_NAME, err_msg, ##__VA_ARGS__)

// Evaluate `value` exactly once (it may be a function call). If the result is
// non-zero, log `errmsg` together with the code and return the code from the
// ENCLOSING function, so this may only be used inside functions whose return
// type can hold that code.
//
// The macro deliberately does not end in ';': the caller supplies it, which
// keeps `if (c) HB_CHECK_SUCCESS(...); else ...` well-formed.
#define HB_CHECK_SUCCESS(value, errmsg)             \
  do {                                              \
    /* value can be a function call */              \
    auto ret_code = (value);                        \
    if (ret_code != 0) {                            \
      LOGE("{}, error code: {}", errmsg, ret_code); \
      return ret_code;                              \
    }                                               \
  } while (0)

/**
 * One classification candidate.
 *
 * id         - index of the entry inside the score tensor
 * score      - score read from the tensor
 * class_name - optional label; the pointer is stored as-is and never freed
 *              by this type, so the pointee must outlive the object
 *
 * Only operator> is provided (ordering by score), which is what
 * std::greater<Classification> needs for the min-heap in get_topk_result().
 */
struct Classification {
  // In-class initializers keep the default state in one place and avoid the
  // initializer-order mismatch the hand-written default constructor had.
  int id{0};
  float score{0.0f};
  const char *class_name{nullptr};

  Classification() = default;
  Classification(int id, float score, const char *class_name)
      : id(id), score(score), class_name(class_name) {}

  friend bool operator>(const Classification &lhs, const Classification &rhs) {
    return (lhs.score > rhs.score);
  }
};

// Queries tensor properties for every model input/output and allocates cached
// memory into sysMem[0] of each tensor. Returns 0 on success.
int prepare_tensor(hbDNNTensor *input_tensor, hbDNNTensor *output_tensor,
                   hbDNNHandle_t dnn_handle);

// Loads images found below `image_file`, converts them to NV12 and writes the
// Y plane into input_tensor[0] and the interleaved UV plane into
// input_tensor[1]. Returns 0 on success, -1 on failure.
int32_t read_image_2_tensor_as_nv12(std::string &image_file,
                                    hbDNNTensor *input_tensor);
// Fills input_tensor[2] with repeated copies of a fixed 4x4 float matrix.
void get_project_mat(hbDNNTensor *input_tensor);
// Fills input_tensor[3] with random floats drawn from [1, 10).
void get_cached_anchor(hbDNNTensor *input_tensor);
// Fills input_tensor[4] with random floats drawn from [1, 10).
void get_cached_feature(hbDNNTensor *input_tensor);
// Fills input_tensor[5] with random floats drawn from [0.01, 0.99).
void get_cached_confidence(hbDNNTensor *input_tensor);
// Writes a single byte with value 1 at the start of input_tensor[6].
void get_mask(hbDNNTensor *input_tensor);

// Reads float scores from `tensor` and appends the `top_k` highest-scoring
// entries to `top_k_cls`, best first.
void get_topk_result(hbDNNTensor *tensor,
                     std::vector<Classification> &top_k_cls, int top_k);

/**
 * Step1: get model handle
 * Step2: prepare input and output tensor
 * Step3: set input data to input tensor
 * Step4: run inference
 * Step5: do postprocess with output data
 * Step6: release resources
 */
int main(int argc, char **argv) {
  // Parsing command line arguments
  gflags::SetUsageMessage(argv[0]);
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  //执行命令行：../x86/bin/run_resnet 
  //            --model_file=../../model/runtime/resnet50/resnet50_224x224_nv12.hbm 
  //            --image_file=../../data/cls_images/zebra_cls.jpg 
  //            --top_k=5
  std::cout << gflags::GetArgv() << std::endl;

  // Init logging
  hobot::hlog::HobotLog::Instance()->SetLogLevel(
      "DNN_BASIC_SAMPLE", hobot::hlog::LogLevel::log_info);

  // DNN句柄，指向打包的多个模型
  hbDNNPackedHandle_t packed_dnn_handle;
  // DNN句柄，指向单一模型
  hbDNNHandle_t dnn_handle;

  const char **model_name_list;
  auto modelFileName = FLAGS_model_file.c_str();
  int model_count = 0;
  // Step1: get model handle
  {
    //从文件完成DNN句柄(指向多个模型)的创建和初始化，可以跨函数、跨线程使用
    HB_CHECK_SUCCESS(
        hbDNNInitializeFromFiles(&packed_dnn_handle, &modelFileName, 1),
        "hbDNNInitializeFromFiles failed");
    //获取DNN句柄(指向多模型)所指向模型的名称列表和个数
    HB_CHECK_SUCCESS(hbDNNGetModelNameList(&model_name_list, &model_count,
                                           packed_dnn_handle),
                     "hbDNNGetModelNameList failed");
    // 从DNN句柄(指向多模型)所指向模型列表中获取一个模型的句柄，可以夸函数、夸线程使用，返回DNN句柄(指向单一模型)
    HB_CHECK_SUCCESS(
        hbDNNGetModelHandle(&dnn_handle, packed_dnn_handle, model_name_list[0]),
        "hbDNNGetModelHandle failed");
  }

  std::vector<hbDNNTensor> input_tensors;
  std::vector<hbDNNTensor> output_tensors;
  int input_count = 0;
  int output_count = 0;
  // Step2: prepare input and output tensor
  {
    // 获取DNN句柄所指向单一模型输入张量的个数
    HB_CHECK_SUCCESS(hbDNNGetInputCount(&input_count, dnn_handle),
                     "hbDNNGetInputCount failed");
    // 获取DNN句柄所指向单一模型输出张量的个数
    HB_CHECK_SUCCESS(hbDNNGetOutputCount(&output_count, dnn_handle),
                     "hbDNNGetOutputCount failed");

    // std::cout << "input_count: " << input_count << std::endl;
    // std::cout << "output_count: " << output_count << std::endl;

    input_tensors.resize(input_count);
    output_tensors.resize(output_count);

    // 为输入输出张量申请系统内存
    prepare_tensor(input_tensors.data(), output_tensors.data(), dnn_handle);
  }

  // std::cout << __LINE__ << std::endl;

  // Step3: set input data to input tensor
  {
    // read a single picture for input_tensor[0], for multi_input model, you
    // should set other input data according to model input properties.
    // 将图像格式转换为设定格式并copy到Tensor中
    HB_CHECK_SUCCESS(
        read_image_2_tensor_as_nv12(FLAGS_image_file, input_tensors.data()),
        "read_image_2_tensor_as_nv12 failed");
    LOGI("read image to tensor as nv12 success");

    get_project_mat(input_tensors.data());
    get_cached_anchor(input_tensors.data());
    get_cached_feature(input_tensors.data());
    get_cached_confidence(input_tensors.data());
    get_mask(input_tensors.data());
  }

  // std::cout << __LINE__ << std::endl;


  hbUCPTaskHandle_t task_handle{nullptr};
  hbDNNTensor *output = output_tensors.data();
  // Step4: run inference
  {
    // make sure memory data is flushed to DDR before inference
    //主动将cache中的数据flush到memory中，防止其它模块访问同一块内存空间时可能会读取到旧数据。
    //Input[0]：mem内存指针   Input[1]：刷新标志符
    for (int i = 0; i < input_count; i++) {
      hbUCPMemFlush(&input_tensors[i].sysMem[0], HB_SYS_MEM_CACHE_CLEAN);
    }

    // generate task handle
    // 根据输入参数创建同步/异步推理任务。
    HB_CHECK_SUCCESS(
        hbDNNInferV2(&task_handle, output, input_tensors.data(), dnn_handle),
        "hbDNNInferV2 failed");

    // std::cout << "Line: " << __LINE__ << std::endl;

    // submit task
    hbUCPSchedParam ctrl_param;
    HB_UCP_INITIALIZE_SCHED_PARAM(&ctrl_param);
    ctrl_param.backend = HB_UCP_BPU_CORE_ANY;
    HB_CHECK_SUCCESS(hbUCPSubmitTask(task_handle, &ctrl_param),
                     "hbUCPSubmitTask failed");
    
    // std::cout << "Line: " << __LINE__ << std::endl;

    // wait task done
    HB_CHECK_SUCCESS(hbUCPWaitTaskDone(task_handle, 0),
                     "hbUCPWaitTaskDone failed");
  }

  // std::cout << "Line: " << __LINE__ << std::endl;


  // Step5: do postprocess with output data
  std::vector<Classification> top_k_cls;
  {
    // make sure CPU read data from DDR before using output tensor data
    for (int i = 0; i < output_count; i++) {
      hbUCPMemFlush(&output_tensors[i].sysMem[0], HB_SYS_MEM_CACHE_INVALIDATE);
    }

    // std::cout << "Line: " << __LINE__ << std::endl;

    get_topk_result(output, top_k_cls, FLAGS_top_k);

    // std::cout << "Line: " << __LINE__ << std::endl;

    for (int i = 0; i < FLAGS_top_k; i++) {
      LOGI("TOP {} result id: {}", i, top_k_cls[i].id);
    }
  }

  // Step6: release resources
  {
    // release task handle
    HB_CHECK_SUCCESS(hbUCPReleaseTask(task_handle), "hbUCPReleaseTask failed");
    // free input mem
    for (int i = 0; i < input_count; i++) {
      HB_CHECK_SUCCESS(hbUCPFree(&(input_tensors[i].sysMem[0])),
                       "hbUCPFree failed");
    }
    // free output mem
    for (int i = 0; i < output_count; i++) {
      HB_CHECK_SUCCESS(hbUCPFree(&(output_tensors[i].sysMem[0])),
                       "hbUCPFree failed");
    }
    // release model
    HB_CHECK_SUCCESS(hbDNNRelease(packed_dnn_handle), "hbDNNRelease failed");
  }

  return 0;
}

// Round `value` up to the next multiple of `alignment`. The bit-mask trick
// only yields a correct result when `alignment` is a power of two.
#define ALIGN(value, alignment) (((value) + ((alignment)-1)) & ~((alignment)-1))
// Round `value` up to the next multiple of 32.
#define ALIGN_32(value) ALIGN(value, 32)

/**
 * Query the properties of every input and output tensor of the model and
 * allocate cached memory for each of them.
 *
 * @param input_tensor  array with one element per model input; `properties`
 *                      and `sysMem[0]` of each element are filled in
 * @param output_tensor array with one element per model output; `properties`
 *                      and `sysMem[0]` of each element are filled in
 * @param dnn_handle    handle of the model to query
 * @return 0 on success; otherwise the error code of the failing runtime call,
 *         or -1 when a stride / memory size cannot be determined. On failure,
 *         memory already allocated for earlier tensors is not released here.
 */
int prepare_tensor(hbDNNTensor *input_tensor, hbDNNTensor *output_tensor,
                   hbDNNHandle_t dnn_handle) {
  int input_count = 0;
  int output_count = 0;
  HB_CHECK_SUCCESS(hbDNNGetInputCount(&input_count, dnn_handle),
                   "hbDNNGetInputCount failed");
  HB_CHECK_SUCCESS(hbDNNGetOutputCount(&output_count, dnn_handle),
                   "hbDNNGetOutputCount failed");

  std::cout << "input_count: " << input_count << std::endl;
  std::cout << "output_count: " << output_count << std::endl;

  /** Tips:
   * For input memory size in most cases:
   * *   input_memSize = input[i].properties.alignedByteSize
   * but here for dynamic stride of y and uv, alignedByteSize is not fixed
   * For output memory size:
   * *   output_memSize = output[i].properties.alignedByteSize
   */
  hbDNNTensor *input = input_tensor;
  for (int i = 0; i < input_count; i++) {
    // Get the properties of input tensor i of the model.
    HB_CHECK_SUCCESS(
        hbDNNGetInputTensorProperties(&input[i].properties, dnn_handle, i),
        "hbDNNGetInputTensorProperties failed");

    /** Tips:
     * For input tensor, usually need to pad the input data according to
     * stride obtained from properties. But here for dynamic stride of y and
     * uv, user needs to specify a value which should be 32 bytes aligned for
     * the -1 position in stride.
     * */
    // Number of dimensions of the tensor's valid shape.
    const auto dim_len = input[i].properties.validShape.numDimensions;
    std::cout << ">>>> dim_len: " << dim_len << std::endl;
    // Resolve dynamic (-1) strides from the innermost dimension outwards:
    // each one becomes the 32-byte aligned size of the next inner dimension.
    for (int32_t dim_i = dim_len - 1; dim_i >= 0; --dim_i) {
      if (input[i].properties.stride[dim_i] != -1) {
        continue;
      }
      // A dynamic stride is derived from dimension dim_i + 1, which does not
      // exist for the innermost dimension.
      if (dim_i + 1 >= dim_len) {
        LOGE("input[{}]: innermost stride is dynamic and cannot be derived", i);
        return -1;
      }
      const auto cur_stride =
          input[i].properties.stride[dim_i + 1] *
          input[i].properties.validShape.dimensionSize[dim_i + 1];
      input[i].properties.stride[dim_i] = ALIGN_32(cur_stride);
    }

    // Keep the full width of the stride type instead of narrowing to int.
    const auto input_mem_size = input[i].properties.stride[0] *
                                input[i].properties.validShape.dimensionSize[0];

    /***************************debug*******************************/
    std::cout << "input_stride: [";
    for (int32_t dim_i = 0; dim_i < dim_len; ++dim_i) {
      std::cout << input[i].properties.stride[dim_i] << ", ";
    }
    std::cout << "]" << std::endl;

    std::cout << "input_dimensionSize: [";
    for (int32_t dim_i = 0; dim_i < dim_len; ++dim_i) {
      std::cout << input[i].properties.validShape.dimensionSize[dim_i] << ", ";
    }
    std::cout << "]" << std::endl;
    /****************************debug******************************/

    // A non-positive size would turn into a huge or empty allocation request.
    if (input_mem_size <= 0) {
      LOGE("input[{}]: invalid memory size {}", i, input_mem_size);
      return -1;
    }

    // Allocate cached memory for the tensor.
    HB_CHECK_SUCCESS(hbUCPMallocCached(&input[i].sysMem[0], input_mem_size, 0),
                     "hbUCPMallocCached failed");

    // Show how to get input name
    const char *input_name;
    HB_CHECK_SUCCESS(hbDNNGetInputName(&input_name, dnn_handle, i),
                     "hbDNNGetInputName failed");
    LOGI("input[{}] name is {}", i, input_name);
  }

  hbDNNTensor *output = output_tensor;
  for (int i = 0; i < output_count; i++) {
    // Get the properties of output tensor i of the model.
    HB_CHECK_SUCCESS(
        hbDNNGetOutputTensorProperties(&output[i].properties, dnn_handle, i),
        "hbDNNGetOutputTensorProperties failed");
    // Aligned byte size of the tensor, kept at its native width.
    const auto output_mem_size = output[i].properties.alignedByteSize;
    HB_CHECK_SUCCESS(
        hbUCPMallocCached(&output[i].sysMem[0], output_mem_size, 0),
        "hbUCPMallocCached failed");

    // Show how to get output name
    const char *output_name;
    HB_CHECK_SUCCESS(hbDNNGetOutputName(&output_name, dnn_handle, i),
                     "hbDNNGetOutputName failed");
    LOGI("output[{}] name is {}", i, output_name);
  }
  return 0;
}


/** You can define read_image_2_tensor_as_other_type to prepare your data **/
int32_t read_image_2_tensor_as_nv12(std::string &image_file_path,
                                    hbDNNTensor *input_tensor) {
  std::vector<std::string> camera_dir = {"0/n008-2018-08-01-15-16-36-0400__CAM_FRONT__1533151604012404.jpg",
                                         "1/n008-2018-08-01-15-16-36-0400__CAM_FRONT_LEFT__1533151604004799.jpg",
                                         "2/n008-2018-08-01-15-16-36-0400__CAM_BACK_LEFT__1533151604047405.jpg",
                                         "3/n008-2018-08-01-15-16-36-0400__CAM_BACK__1533151604037558.jpg",
                                         "4/n008-2018-08-01-15-16-36-0400__CAM_BACK_RIGHT__1533151604028370.jpg",
                                         "5/n008-2018-08-01-15-16-36-0400__CAM_FRONT_RIGHT__1533151604020482.jpg"};
  // the struct of input shape is NHWC
  int input_h = input_tensor[0].properties.validShape.dimensionSize[1];
  int input_w = input_tensor[0].properties.validShape.dimensionSize[2];
  int input_n = input_tensor[0].properties.validShape.dimensionSize[0];

  // copy y data
  uint8_t *y_data_dst =
      reinterpret_cast<uint8_t *>(input_tensor[0].sysMem[0].virAddr);
  uint8_t *uv_data_dst =
    reinterpret_cast<uint8_t *>(input_tensor[1].sysMem[0].virAddr);


  // std::cout << "input_tensor[0].properties.stride[1]: " << input_tensor[0].properties.stride[1] << std::endl;
  
  for(size_t i = 0; i < input_n; ++i){
    std::string image_file = image_file_path + '/' + camera_dir[i / 2];

    // std::cout << __LINE__ << ": " << image_file << std::endl;

    cv::Mat bgr_mat = cv::imread(image_file, cv::IMREAD_COLOR);
    if (bgr_mat.empty()) {
      LOGE("image file not exist!");
      return -1;
    }
    // resize
    cv::Mat mat;
    mat.create(input_h, input_w, bgr_mat.type());
    // std::cout << __LINE__ << ": input_wh: " << input_h << ", " << input_w << std::endl;
    cv::resize(bgr_mat, mat, mat.size(), 0, 0);
    // convert to YUV420
    if (input_h % 2 || input_w % 2) {
      LOGE("input img height and width must aligned by 2!");
      return -1;
    }

    // std::cout << __LINE__ << std::endl;
    
    cv::Mat yuv_mat;
    cv::cvtColor(mat, yuv_mat, cv::COLOR_BGR2YUV_I420);
    uint8_t *yuv_data = yuv_mat.ptr<uint8_t>();
    uint8_t *y_data_src = yuv_data;

    // // copy y data
    // uint8_t *y_data_dst =
    //     reinterpret_cast<uint8_t *>(input_tensor[0].sysMem[0].virAddr);

    // std::cout << __LINE__ << std::endl;
    
    for (int32_t h = 0; h < input_h; ++h) {
      memcpy(y_data_dst, y_data_src, input_w);
      y_data_src += input_w;
      // add padding
      y_data_dst += input_tensor[0].properties.stride[1];
    }

    // std::cout << __LINE__ << std::endl;

    // copy uv data
    int32_t uv_height = input_tensor[1].properties.validShape.dimensionSize[1];
    int32_t uv_width = input_tensor[1].properties.validShape.dimensionSize[2];
    // uint8_t *uv_data_dst =
    //     reinterpret_cast<uint8_t *>(input_tensor[1].sysMem[0].virAddr);
    uint8_t *u_data_src = yuv_data + input_h * input_w;
    uint8_t *v_data_src = u_data_src + uv_height * uv_width;

    for (int32_t h = 0; h < uv_height; ++h) {
      auto *cur_data = uv_data_dst;
      for (int32_t w = 0; w < uv_width; ++w) {
        *cur_data++ = *u_data_src++;
        *cur_data++ = *v_data_src++;
      }
      // add padding
      uv_data_dst += input_tensor[1].properties.stride[1];
    }
  }
  return 0;
}


/**
 * Fill input_tensor[2] with a fixed projection matrix.
 *
 * The tensor is written as tightly packed floats: for every index of
 * dimension 0, the top-left dim1 x dim2 part of the 4x4 matrix is copied.
 * Nothing is written (and an error is logged) if the tensor has no memory or
 * if dim1 / dim2 exceed the matrix size.
 *
 * @param input_tensor tensor array; element 2 must already be prepared
 */
void get_project_mat(hbDNNTensor *input_tensor){
  constexpr int32_t kMatDim = 4;
  const float project_mat[kMatDim][kMatDim] =
    {
      {6.6223e+02, -1.5405e+01,  4.0376e+00, -1.0416e+03},
      {3.6797e+01,  5.7126e+01, -5.5237e+02,  7.4584e+02},
      {5.6866e-01,  8.2239e-01,  1.7202e-02, -1.3333e+00},
      {0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00}
    };

  const int32_t dim0 = input_tensor[2].properties.validShape.dimensionSize[0];
  const int32_t dim1 = input_tensor[2].properties.validShape.dimensionSize[1];
  const int32_t dim2 = input_tensor[2].properties.validShape.dimensionSize[2];

  // d1 / d2 index the 4x4 matrix below, so larger dimensions would read past
  // the end of project_mat.
  if (dim1 > kMatDim || dim2 > kMatDim) {
    LOGE("project mat tensor shape [{}, {}, {}] does not fit a {}x{} matrix",
         dim0, dim1, dim2, kMatDim, kMatDim);
    return;
  }

  uint8_t *data_dst =
      reinterpret_cast<uint8_t *>(input_tensor[2].sysMem[0].virAddr);
  if (data_dst == nullptr) {
    LOGE("project mat tensor memory is not allocated");
    return;
  }

  for (int32_t d0 = 0; d0 < dim0; ++d0){
    for(int32_t d1 = 0; d1  < dim1; ++d1){
      for(int32_t d2 = 0; d2 < dim2; ++d2){
        std::memcpy(data_dst, &project_mat[d1][d2], sizeof(float));
        data_dst += sizeof(float);
      }
    }
  }
}

// Populate input_tensor[3] with random placeholder values: one float drawn
// uniformly from [1, 10) for every element of the first three dimensions,
// written back to back from the start of the tensor memory.
void get_cached_anchor(hbDNNTensor *input_tensor){
  hbDNNTensor &anchor = input_tensor[3];
  const auto &shape = anchor.properties.validShape;
  const int32_t outer = shape.dimensionSize[0];
  const int32_t middle = shape.dimensionSize[1];
  const int32_t inner = shape.dimensionSize[2];

  // Non-deterministic seed, so the contents differ from run to run.
  std::random_device seed_source;
  std::mt19937 engine(seed_source());
  std::uniform_real_distribution<float> sampler(1.0f, 10.0f);

  uint8_t *cursor = reinterpret_cast<uint8_t *>(anchor.sysMem[0].virAddr);
  for (int32_t a = 0; a < outer; ++a) {
    for (int32_t b = 0; b < middle; ++b) {
      for (int32_t c = 0; c < inner; ++c) {
        const float sample = sampler(engine);
        // Byte-wise copy, then step to the next float slot.
        memcpy(cursor, &sample, sizeof(sample));
        cursor += sizeof(sample);
      }
    }
  }
}

// Populate input_tensor[4] with random placeholder values: one float drawn
// uniformly from [1, 10) for every element of the first three dimensions,
// written back to back from the start of the tensor memory.
void get_cached_feature(hbDNNTensor *input_tensor){
  hbDNNTensor &feature = input_tensor[4];
  const auto &shape = feature.properties.validShape;
  const int32_t outer = shape.dimensionSize[0];
  const int32_t middle = shape.dimensionSize[1];
  const int32_t inner = shape.dimensionSize[2];

  // Non-deterministic seed, so the contents differ from run to run.
  std::random_device seed_source;
  std::mt19937 engine(seed_source());
  std::uniform_real_distribution<float> sampler(1.0f, 10.0f);

  uint8_t *cursor = reinterpret_cast<uint8_t *>(feature.sysMem[0].virAddr);
  for (int32_t a = 0; a < outer; ++a) {
    for (int32_t b = 0; b < middle; ++b) {
      for (int32_t c = 0; c < inner; ++c) {
        const float sample = sampler(engine);
        // Byte-wise copy, then step to the next float slot.
        memcpy(cursor, &sample, sizeof(sample));
        cursor += sizeof(sample);
      }
    }
  }
}

// Populate input_tensor[5] with random placeholder values: one float drawn
// uniformly from [0.01, 0.99) for every element of the first two dimensions,
// written back to back from the start of the tensor memory.
void get_cached_confidence(hbDNNTensor *input_tensor){
  hbDNNTensor &confidence = input_tensor[5];
  const auto &shape = confidence.properties.validShape;
  const int32_t rows = shape.dimensionSize[0];
  const int32_t cols = shape.dimensionSize[1];

  // Non-deterministic seed, so the contents differ from run to run.
  std::random_device seed_source;
  std::mt19937 engine(seed_source());
  std::uniform_real_distribution<float> sampler(0.01f, 0.99f);

  uint8_t *cursor = reinterpret_cast<uint8_t *>(confidence.sysMem[0].virAddr);
  for (int32_t r = 0; r < rows; ++r) {
    for (int32_t c = 0; c < cols; ++c) {
      const float sample = sampler(engine);
      // Byte-wise copy, then step to the next float slot.
      memcpy(cursor, &sample, sizeof(sample));
      cursor += sizeof(sample);
    }
  }
}

// Set the first byte of input_tensor[6] to 1 (true). Only that single byte is
// written, regardless of the tensor's shape.
void get_mask(hbDNNTensor *input_tensor){
  const uint8_t enabled = static_cast<uint8_t>(true);
  uint8_t *mask_bytes =
      reinterpret_cast<uint8_t *>(input_tensor[6].sysMem[0].virAddr);
  mask_bytes[0] = enabled;
}

/**
 * Collect the `top_k` highest-scoring entries of a float score tensor.
 *
 * The tensor memory is interpreted as an array of float scores; up to 1000
 * entries are examined, limited to what fits into the tensor's aligned byte
 * size so the buffer is never over-read.
 *
 * @param tensor    output tensor whose sysMem[0] holds the scores
 * @param top_k_cls results are appended here, ordered from highest to lowest
 *                  score; entries already present are left untouched
 * @param top_k     number of entries to keep; nothing is appended if <= 0
 */
void get_topk_result(hbDNNTensor *tensor,
                     std::vector<Classification> &top_k_cls, int top_k) {
  hbUCPMemFlush(&(tensor->sysMem[0]), HB_SYS_MEM_CACHE_INVALIDATE);

  // A negative top_k would otherwise be converted to a huge unsigned value in
  // the size comparison below and keep every entry.
  if (top_k <= 0) {
    return;
  }

  // The type reinterpret_cast should be determined according to the output type
  // For example: HB_DNN_TENSOR_TYPE_F32 is float
  const float *data =
      reinterpret_cast<const float *>(tensor->sysMem[0].virAddr);
  if (data == nullptr) {
    LOGE("output tensor memory is not allocated");
    return;
  }

  auto quanti_type{tensor->properties.quantiType};
  // For example model, quantiType is NONE and no dequantize processing is required.
  // Processing continues even if this is not the case; the values read below
  // are then raw, not dequantized, data.
  if (quanti_type != hbDNNQuantiType::NONE) {
    LOGE("quanti_type is not NONE, and the output needs to be dequantized!");
  }

  // 1000 classification score values are expected; never read more floats
  // than the tensor memory can hold.
  constexpr int64_t kExpectedScores = 1000;
  const int64_t capacity =
      static_cast<int64_t>(tensor->properties.alignedByteSize) /
      static_cast<int64_t>(sizeof(float));
  const int64_t tensor_len = std::min(kExpectedScores, capacity);
  if (tensor_len < kExpectedScores) {
    LOGW("output tensor holds only {} float values, fewer than the expected {}",
         tensor_len, kExpectedScores);
  }

  // Min-heap on score: the weakest of the kept candidates is on top and is
  // dropped as soon as more than top_k candidates are held.
  std::priority_queue<Classification, std::vector<Classification>,
                      std::greater<Classification>>
      queue;
  const std::size_t keep = static_cast<std::size_t>(top_k);
  for (int64_t i = 0; i < tensor_len; i++) {
    queue.push(Classification(static_cast<int>(i), data[i], ""));
    if (queue.size() > keep) {
      queue.pop();
    }
  }

  // The heap yields candidates in ascending score order; append them and
  // reverse only the newly added range to get descending order.
  const std::ptrdiff_t first_new =
      static_cast<std::ptrdiff_t>(top_k_cls.size());
  while (!queue.empty()) {
    top_k_cls.emplace_back(queue.top());
    queue.pop();
  }
  std::reverse(top_k_cls.begin() + first_new, top_k_cls.end());
}
