Multimodal Input

Last updated on: 2025-05-07 03:22:46

Overview

Multimodal input refers to the process where a device interacts with a large AI model through multiple modalities, such as text, audio, and video. The model processes these inputs and returns integrated responses.

How it works

The interaction involves three parties: the application, the SDK, and the cloud. The sequence is as follows:

1. Start interaction.
2. Input video, audio, text, file, or image data.
3. Upload multimodal data to the cloud.
4. Complete interaction.
5. Cloud returns results.
6. Output the information.

APIs

Initialization

/**
 * @brief Output callbacks, carried in the `output` member of AI_AGENT_CFG_T.
 */
typedef struct {
    /** Called when an event is received; `type` is the event type. */
    OPERATE_RET(*event_cb)(AI_EVENT_TYPE type);
    /** Called when media attributes are received. */
    OPERATE_RET(*media_attr_cb)(AI_BIZ_ATTR_INFO_T *attr);
    /** Called when media data is received; `type` is the packet type, `data`/`len` the payload. */
    OPERATE_RET(*media_data_cb)(AI_PACKET_PT type, CHAR_T *data, UINT_T len);
    /** Called when a piece of the text stream is received; `type` is the text type. */
    OPERATE_RET(*text_cb)(AI_TEXT_TYPE_E type, CHAR_T *data, INT_T len);
    /** Called when an alert is received; `type` is the alert type. */
    OPERATE_RET(*alert_cb)(AI_ALERT_TYPE_E type);
} AI_OUTPUT_CBS_T;
/**
 * @brief Configuration passed to tuya_ai_agent_init().
 */
typedef struct {
    /** Business code. NOTE(review): meaning is not shown here -- confirm against the SDK headers. */
    UINT32_T biz_code;
    /** Base media attributes (the example on this page fills in audio, video, image and file). */
    AI_ATTR_BASE_T attr;
    /** Input senders, AI_MAX_SESSION_ID_NUM entries. NOTE(review): presumably one per session -- confirm. */
    AI_INPUT_SEND_T biz_get[AI_MAX_SESSION_ID_NUM];
    /** Output callbacks. */
    AI_OUTPUT_CBS_T output;
} AI_AGENT_CFG_T;
/**
 * @brief AI agent init
 *
 * @param[in] cfg agent configuration (see AI_AGENT_CFG_T)
 *
 * @return OPRT_OK on success. Others on error, please refer to tuya_error_code.h
 */
OPERATE_RET tuya_ai_agent_init(AI_AGENT_CFG_T *cfg);

Input

Start interaction

/**
 * @brief AI input start
 *
 * Takes no arguments and returns nothing.
 */
VOID tuya_ai_input_start(VOID);

Input audio

/**
 * @brief ai audio input
 *
 * @param[in] timestamp audio timestamp
 * @param[in] pts audio pts
 * @param[in] data audio data
 * @param[in] len audio data length
 *
 * @return OPRT_OK on success. Others on error, please refer to tuya_error_code.h
 */
OPERATE_RET tuya_ai_audio_input(UINT64_T timestamp, UINT64_T pts, BYTE_T *data, UINT_T len);

Input video

/**
 * @brief AI video input
 *
 * @param[in] timestamp video timestamp
 * @param[in] pts video presentation timestamp (pts)
 * @param[in] data video data
 * @param[in] len video data length
 *
 * @return OPRT_OK on success. Others on error, please refer to tuya_error_code.h
 */
OPERATE_RET tuya_ai_video_input(UINT64_T timestamp, UINT64_T pts, BYTE_T *data, UINT_T len);

Input text

/**
 * @brief AI text input
 *
 * @param[in] data text data
 * @param[in] len text data length
 *
 * @return OPRT_OK on success. Others on error, please refer to tuya_error_code.h
 */
OPERATE_RET tuya_ai_text_input(BYTE_T *data, UINT_T len);

Input file

/**
 * @brief AI file input
 *
 * @param[in] data file data
 * @param[in] len file data length
 *
 * @return OPRT_OK on success. Others on error, please refer to tuya_error_code.h
 */
OPERATE_RET tuya_ai_file_input(BYTE_T *data, UINT_T len);

Input image

/**
 * @brief AI image input
 *
 * @param[in] timestamp image timestamp
 * @param[in] data image data
 * @param[in] len image data length
 *
 * @return OPRT_OK on success. Others on error, please refer to tuya_error_code.h
 */
OPERATE_RET tuya_ai_image_input(UINT64_T timestamp, BYTE_T *data, UINT_T len);

Complete interaction

/**
 * @brief AI input stop
 *
 * Takes no arguments and returns nothing.
 */
VOID tuya_ai_input_stop(VOID);

Output

Invoke callback

The response is output by invoking the output callbacks (AI_OUTPUT_CBS_T) registered through tuya_ai_agent_init.

Output local audio

When the device encounters network connectivity issues, disconnections, or ASR recognition failures, it should play built-in audio prompts by using the following APIs:

Start output

/**
 * @brief AI output start
 *
 * @return OPRT_OK on success. Others on error, please refer to tuya_error_code.h
 */
OPERATE_RET tuya_ai_output_start(VOID);

Output the information

/**
 * @brief AI output write
 *
 * @param[in] type packet type
 * @param[in] data data buffer
 * @param[in] len data length
 *
 * @return OPRT_OK on success. Others on error, please refer to tuya_error_code.h
 */
OPERATE_RET tuya_ai_output_write(AI_PACKET_PT type, UINT8_T *data, UINT_T len);

Complete output

/**
 * @brief AI output stop
 *
 * @param[in] force force stop
 *
 * @return OPRT_OK on success. Others on error, please refer to tuya_error_code.h
 */
OPERATE_RET tuya_ai_output_stop(BOOL_T force);

Example

/* Long-press threshold handed to the key driver as long_key_time. */
#define LONG_KEY_TIME   400 // ms
/* Delay slept before input is stopped when the key is released in hold mode. */
#define AI_DEMO_INPUT_DELAY 300 // ms

/* Recorder state; audio/video frames are forwarded only while it is RECORDER_STATUS_PROC. */
typedef enum {
    RECORDER_STATUS_PROC,
    RECORDER_STATUS_STOPPING, /* not referenced by the code on this page */
    RECORDER_STATUS_STOPPED,
} RECORDER_STATUS_E;

/* Demo input context: a copy of the input configuration plus the recorder state. */
typedef struct {
    AI_DEMO_INPUT_CFG_T cfg;
    RECORDER_STATUS_E recorder_state;
} AI_INPUT_CTX_T;

/* Single file-scope context shared by the key handler and the frame callbacks. */
STATIC AI_INPUT_CTX_T ai_input_ctx;

/**
 * @brief Key handling for hold-to-talk mode.
 *
 * LONG_KEY / NORMAL_KEY: stop any interaction still in progress (input and,
 * forcibly, output), then start a new input and mark the recorder as running.
 * RELEASE_KEY: if the recorder is not already stopped, wait
 * AI_DEMO_INPUT_DELAY ms, mark it stopped and stop the input.
 * Any other key type is ignored.
 *
 * @param[in] type key event type
 */
STATIC VOID __ai_trigger_mode_hold(PUSH_KEY_TYPE_E type)
{
    switch(type) {
        case LONG_KEY:
        case NORMAL_KEY:
            // Interrupt whatever is still running before starting over.
            if(RECORDER_STATUS_STOPPED != ai_input_ctx.recorder_state) {
                ai_input_ctx.recorder_state = RECORDER_STATUS_STOPPED;
                tuya_ai_input_stop();
                tuya_ai_output_stop(TRUE);
            }
            tuya_ai_input_start();
            ai_input_ctx.recorder_state = RECORDER_STATUS_PROC;
            break;

        case RELEASE_KEY:
            if(RECORDER_STATUS_STOPPED == ai_input_ctx.recorder_state) {
                break;
            }
            tal_system_sleep(AI_DEMO_INPUT_DELAY);
            ai_input_ctx.recorder_state = RECORDER_STATUS_STOPPED;
            tuya_ai_input_stop();
            break;

        default:
            break;
    }
}

/**
 * @brief Key handling for the non-hold trigger modes.
 *
 * Plays the wakeup alert on NORMAL_KEY; every other key type is ignored.
 *
 * @param[in] type key event type
 */
STATIC VOID __ai_trigger_mode_oneshot(PUSH_KEY_TYPE_E type)
{
    if(NORMAL_KEY != type) {
        return;
    }

    ai_demo_player_alert(ALART_TYPE_WAKEUP);
}

/**
 * @brief Key callback registered with the key driver.
 *
 * SEQ_KEY advances the trigger mode (wrapping back to AI_TRIGGER_MODE_HOLD
 * after AI_TRIGGER_MODE_FREE) and plays the alert for the new mode.
 * Every other key type is dispatched to the handler of the current mode.
 *
 * @param[in] port key port (unused)
 * @param[in] type key event type
 * @param[in] cnt  count reported by the key driver (unused)
 */
STATIC VOID __ai_input_key(UINT_T port, PUSH_KEY_TYPE_E type, INT_T cnt)
{
    PR_DEBUG("key evt: %d", type);

    if(SEQ_KEY != type) {
        // Trigger AI interaction
        if(AI_TRIGGER_MODE_HOLD == ai_input_ctx.cfg.ai_trigger_mode) {
            __ai_trigger_mode_hold(type);
        } else if((AI_TRIGGER_MODE_ONE_SHOT == ai_input_ctx.cfg.ai_trigger_mode) ||
                  (AI_TRIGGER_MODE_FREE == ai_input_ctx.cfg.ai_trigger_mode) ||
                  (AI_TRIGGER_MODE_WAKEUP == ai_input_ctx.cfg.ai_trigger_mode)) {
            __ai_trigger_mode_oneshot(type);
        }
        return;
    }

    // Toggle AI interaction mode
    ai_input_ctx.cfg.ai_trigger_mode++;
    if(ai_input_ctx.cfg.ai_trigger_mode > AI_TRIGGER_MODE_FREE) {
        ai_input_ctx.cfg.ai_trigger_mode = AI_TRIGGER_MODE_HOLD;
    }

    // Announce the newly selected mode.
    switch(ai_input_ctx.cfg.ai_trigger_mode) {
        case AI_TRIGGER_MODE_HOLD:     ai_demo_player_alert(ALART_TYPE_LONG_KEY_TALK); break;
        case AI_TRIGGER_MODE_ONE_SHOT: ai_demo_player_alert(ALART_TYPE_KEY_TALK);      break;
        case AI_TRIGGER_MODE_WAKEUP:   ai_demo_player_alert(ALART_TYPE_WAKEUP_TALK);   break;
        case AI_TRIGGER_MODE_FREE:     ai_demo_player_alert(ALART_TYPE_RANDOM_TALK);   break;
        default:                       break;
    }
}

/**
 * @brief Audio frame callback: forwards a captured frame to the AI input.
 *
 * Frames are dropped unless the recorder state is RECORDER_STATUS_PROC and
 * the frame carries a non-empty buffer.
 *
 * @param[in] pframe captured audio frame
 *
 * @return 0 when the frame was forwarded, 1 when it was dropped or sending failed
 */
STATIC INT_T __ai_input_audio(TKL_AUDIO_FRAME_INFO_T *pframe)
{
    OPERATE_RET rt = OPRT_OK;

    // The state check comes first so pframe is not touched while idle.
    if((RECORDER_STATUS_PROC != ai_input_ctx.recorder_state) ||
       (NULL == pframe->pbuf) ||
       (0 == pframe->buf_size)) {
        return 1;
    }

    rt = tuya_ai_audio_input(pframe->timestamp, pframe->pts, pframe->pbuf, pframe->buf_size);
    if(OPRT_OK == rt) {
        return 0;
    }

    PR_ERR("ai audio input failed %d", rt);
    return 1;
}

/**
 * @brief Video frame callback: forwards I-frames to the AI input.
 *
 * Frames are dropped (return 1) unless the recorder state is
 * RECORDER_STATUS_PROC and the frame carries a non-empty buffer.
 * Frames that are not TKL_VIDEO_I_FRAME are skipped and reported as 0.
 *
 * @param[in] pframe encoded video frame
 *
 * @return 0 when the frame was forwarded or skipped as a non-I-frame,
 *         1 when it was dropped or sending failed
 */
STATIC INT_T __ai_input_video(TKL_VENC_FRAME_T *pframe)
{
    OPERATE_RET rt = OPRT_OK;

    // The state check comes first so pframe is not touched while idle.
    if((RECORDER_STATUS_PROC != ai_input_ctx.recorder_state) ||
       (NULL == pframe->pbuf) ||
       (0 == pframe->buf_size)) {
        return 1;
    }

    if(TKL_VIDEO_I_FRAME == pframe->frametype) {
        rt = tuya_ai_video_input(pframe->timestamp, pframe->pts, pframe->pbuf, pframe->buf_size);
        if(OPRT_OK != rt) {
            PR_ERR("ai video input failed %d", rt);
            return 1;
        }
    }

    return 0;
}

/**
 * @brief Initialise the board and start audio capture.
 *
 * VAD is enabled on the board for the one-shot and free trigger modes only.
 * The audio configuration registers __ai_input_audio as the frame callback
 * and sets `enable` to 1 only in free mode.
 *
 * @param[in] cfg demo input configuration
 *
 * @return OPRT_OK on success, otherwise the error of the failing tkl_ai_* call
 */
STATIC OPERATE_RET _ai_input_mic_init(AI_DEMO_INPUT_CFG_T *cfg)
{
    OPERATE_RET rt = OPRT_OK;
    AI_BOARD_CFG_T board_cfg = {0};
    TKL_AUDIO_CONFIG_T config = {0};

    switch(cfg->ai_trigger_mode) {
        case AI_TRIGGER_MODE_ONE_SHOT:
        case AI_TRIGGER_MODE_FREE:
            board_cfg.vad = TRUE;
            break;
        default:
            board_cfg.vad = FALSE;
            break;
    }
    ai_board_init(&board_cfg);

    // Capture settings
    config.enable    = (cfg->ai_trigger_mode == AI_TRIGGER_MODE_FREE) ? 1 : 0;
    config.ai_chn    = 0;
    config.card      = TKL_AUDIO_TYPE_BOARD;
    config.codectype = TKL_CODEC_AUDIO_PCM;
    config.sample    = AI_AUDIO_INPUT_SAMPLE_RATE;
    config.datebits  = AI_AUDIO_INPUT_SAMPLE_BITS;
    config.channel   = AI_AUDIO_INPUT_CHANNEL;
    config.put_cb    = __ai_input_audio;

    // Speaker settings
    config.spk_sample        = 16000;
    config.spk_gpio          = cfg->spk_en_pin;
    config.spk_gpio_polarity = TUYA_GPIO_LEVEL_LOW;

    TUYA_CALL_ERR_RETURN(tkl_ai_init(&config, 0));
    TUYA_CALL_ERR_RETURN(tkl_ai_start(0, 0));

    return rt;
}

/* Display device handle and its parameters; filled in by the video init functions
 * and kept at file scope because video_lcd.device_info points at video_disp_info. */
STATIC TKL_DISP_DEVICE_S video_lcd;
STATIC TKL_DISP_INFO_S video_disp_info;
/**
 * @brief Bring up the LCD and a DVP camera, then register the video frame callback.
 *
 * Fills video_disp_info with the panel parameters, drives the LCD reset pin
 * high, initialises the display, configures video input for a DVP camera
 * producing MJPEG at 480x480 @ 15 fps, and initialises the encoder with
 * __ai_input_video as the frame callback. A failure of tkl_venc_init is
 * logged; the function has no return value.
 */
STATIC VOID __ai_video_dvp_init(VOID)
{
    OPERATE_RET rt = OPRT_OK;
    // lcd
    video_disp_info.width = 320;
    video_disp_info.height = 480;
    video_disp_info.fps = 15;
    video_disp_info.format = TKL_DISP_PIXEL_FMT_RGB565;
    video_disp_info.rotation = TKL_DISP_ROTATION_0;

    video_disp_info.ll_ctrl.enable_lcd_pipeline = 0;

    video_disp_info.ll_ctrl.bl.io              = TUYA_GPIO_NUM_9;
    video_disp_info.ll_ctrl.bl.mode            = TKL_DISP_BL_GPIO;
    video_disp_info.ll_ctrl.bl.active_level    = TUYA_GPIO_LEVEL_HIGH;

    video_disp_info.ll_ctrl.spi.clk            = TUYA_GPIO_NUM_49;
    video_disp_info.ll_ctrl.spi.csx            = TUYA_GPIO_NUM_48;
    video_disp_info.ll_ctrl.spi.sda            = TUYA_GPIO_NUM_50;
    video_disp_info.ll_ctrl.spi.rst_mode       = TKL_DISP_GPIO_RESET;
    video_disp_info.ll_ctrl.spi.rst            = TUYA_GPIO_NUM_53;

    video_disp_info.ll_ctrl.power_ctrl_pin     = TUYA_GPIO_NUM_56;     // no lcd ldo
    video_disp_info.ll_ctrl.power_active_level = TUYA_GPIO_LEVEL_HIGH;
    video_disp_info.ll_ctrl.rgb_mode           = TKL_DISP_PIXEL_FMT_RGB565;

    video_disp_info.ll_ctrl.tp.tp_i2c_clk      = TUYA_GPIO_NUM_13;
    video_disp_info.ll_ctrl.tp.tp_i2c_sda      = TUYA_GPIO_NUM_15;
    video_disp_info.ll_ctrl.tp.tp_rst          = TUYA_GPIO_NUM_54;
    video_disp_info.ll_ctrl.tp.tp_intr         = TUYA_GPIO_NUM_55;

    video_disp_info.ll_ctrl.init_param         = NULL;

    // Pull up the lcd rst pin
    TUYA_GPIO_BASE_CFG_T gpio_cfg = {
        .direct = TUYA_GPIO_OUTPUT,
        .mode = TUYA_GPIO_PULLUP,
        .level = TUYA_GPIO_LEVEL_HIGH,
    };
    tkl_gpio_init(TUYA_GPIO_NUM_53, &gpio_cfg);
    tkl_gpio_write(TUYA_GPIO_NUM_53, 1);

    // Copy the panel name, always leaving room for the terminating NUL that
    // the memset provides (the name is truncated if the buffer is too small).
    memset(video_disp_info.ll_ctrl.ic_name, 0, IC_NAME_LENGTH);
    int len = (int)strlen("T35P128CQ");
    if(len > (int)IC_NAME_LENGTH - 1) {
        len = (int)IC_NAME_LENGTH - 1;
    }
    if(len > 0) {
        memcpy(video_disp_info.ll_ctrl.ic_name, "T35P128CQ", len);
    }

    video_lcd.device_info = &video_disp_info;

    tkl_disp_init(&video_lcd, NULL);

    tkl_disp_set_brightness(NULL, 100);

    // dvp
    // Zero-initialise so that fields not set below are not indeterminate.
    TKL_VI_CONFIG_T vi_config = {0};
    TKL_VI_EXT_CONFIG_T ext_conf = {0};

    ext_conf.type = TKL_VI_EXT_CONF_CAMERA;
    ext_conf.camera.camera_type = TKL_VI_CAMERA_TYPE_DVP;
    ext_conf.camera.fmt = TKL_CODEC_VIDEO_MJPEG;

    ext_conf.camera.power_pin = TUYA_GPIO_NUM_51;
    ext_conf.camera.active_level = TUYA_GPIO_LEVEL_HIGH;
    ext_conf.camera.i2c.clk = TUYA_GPIO_NUM_13;
    ext_conf.camera.i2c.sda = TUYA_GPIO_NUM_15;

    vi_config.isp.width = 480;
    vi_config.isp.height = 480;
    vi_config.isp.fps = 15;
    vi_config.pdata = &ext_conf;
    tkl_vi_init(&vi_config, 0);

    TKL_VENC_CONFIG_T h264_config = {0};
    h264_config.enable_h264_pipeline = 0;
    h264_config.put_cb = __ai_input_video;
    // This function returns VOID, so the error is logged instead of returned.
    TUYA_CALL_ERR_LOG(tkl_venc_init(0, &h264_config, 0));
}

static void __ai_video_uvc_init(void)
{
    OPERATE_RET rt = OPRT_OK;
    // lcd
    video_disp_info.width = 320;
    video_disp_info.height = 480;
    video_disp_info.fps = 15;
    video_disp_info.format = TKL_DISP_PIXEL_FMT_RGB565;
    video_disp_info.rotation = TKL_DISP_ROTATION_0;

    video_disp_info.ll_ctrl.enable_lcd_pipeline = 1;

    video_disp_info.ll_ctrl.bl.io              = TUYA_GPIO_NUM_9;
    video_disp_info.ll_ctrl.bl.mode            = TKL_DISP_BL_GPIO;
    video_disp_info.ll_ctrl.bl.active_level    = TUYA_GPIO_LEVEL_HIGH;

    video_disp_info.ll_ctrl.spi.clk            = TUYA_GPIO_NUM_49;
    video_disp_info.ll_ctrl.spi.csx            = TUYA_GPIO_NUM_48;
    video_disp_info.ll_ctrl.spi.sda            = TUYA_GPIO_NUM_50;
    video_disp_info.ll_ctrl.spi.rst_mode       = TKL_DISP_GPIO_RESET;
    video_disp_info.ll_ctrl.spi.rst            = TUYA_GPIO_NUM_53;

    video_disp_info.ll_ctrl.power_ctrl_pin     = TUYA_GPIO_NUM_56;     // no lcd ldo
    video_disp_info.ll_ctrl.power_active_level = TUYA_GPIO_LEVEL_HIGH;
    video_disp_info.ll_ctrl.rgb_mode           = TKL_DISP_PIXEL_FMT_RGB565;

    video_disp_info.ll_ctrl.tp.tp_i2c_clk      = TUYA_GPIO_NUM_13;
    video_disp_info.ll_ctrl.tp.tp_i2c_sda      = TUYA_GPIO_NUM_15;
    video_disp_info.ll_ctrl.tp.tp_rst          = TUYA_GPIO_NUM_54;
    video_disp_info.ll_ctrl.tp.tp_intr         = TUYA_GPIO_NUM_55;

    video_disp_info.ll_ctrl.init_param         = NULL;

    // Pull up the lcd rst pin
    TUYA_GPIO_BASE_CFG_T gpio_cfg = {
        .direct = TUYA_GPIO_OUTPUT,
        .mode = TUYA_GPIO_PULLUP,
        .level = TUYA_GPIO_LEVEL_HIGH,
    };
    tkl_gpio_init(TUYA_GPIO_NUM_53, &gpio_cfg);
    tkl_gpio_write(TUYA_GPIO_NUM_53, 1);

    memset(video_disp_info.ll_ctrl.ic_name, 0, IC_NAME_LENGTH);
    int len = (IC_NAME_LENGTH < sizeof("T35P128CQ"))? IC_NAME_LENGTH: strlen("T35P128CQ");
    memcpy(video_disp_info.ll_ctrl.ic_name, "T35P128CQ", len);

    video_lcd.device_info = &video_disp_info;

    tkl_disp_init(&video_lcd, NULL);

    tkl_disp_set_brightness(NULL, 100);

    // uvc
    uint8_t uvc_status = 0xff;
    TKL_VI_CONFIG_T vi_config;
    TKL_VI_EXT_CONFIG_T ext_conf;

    ext_conf.type = TKL_VI_EXT_CONF_CAMERA;
    ext_conf.camera.camera_type = TKL_VI_CAMERA_TYPE_UVC;
    ext_conf.camera.fmt = TKL_CODEC_VIDEO_MJPEG;
    ext_conf.camera.power_pin = TUYA_GPIO_NUM_28;
    ext_conf.camera.active_level = TUYA_GPIO_LEVEL_HIGH;
    vi_config.isp.width = 800;
    vi_config.isp.height = 480;
    vi_config.isp.fps = 15;
    vi_config.pdata = &ext_conf;
    tkl_vi_init(&vi_config, 0);

    TKL_VENC_CONFIG_T h264_config = {0};
    h264_config.enable_h264_pipeline = 1;
    h264_config.put_cb = __ai_input_video;
    TUYA_CALL_ERR_RETURN(tkl_venc_init(0, &h264_config, 0));
}

/**
 * @brief Initialise the video input path.
 *
 * Uses the UVC camera setup. For a DVP camera, call __ai_video_dvp_init()
 * here instead.
 *
 * @param[in] cfg demo input configuration (not used)
 *
 * @return always OPRT_OK
 */
STATIC OPERATE_RET _ai_input_video_init(AI_DEMO_INPUT_CFG_T *cfg)
{
    __ai_video_uvc_init(); // uvc (dvp alternative: __ai_video_dvp_init())

    return OPRT_OK;
}

/**
 * @brief Configure the trigger pin and register it with the key driver.
 *
 * The pin is set up as a pulled-up input, registered as a low-level-detect
 * key with __ai_input_key as its callback, and its GPIO interrupt is enabled.
 * GPIO errors are logged rather than returned early.
 *
 * @param[in] cfg demo input configuration; ai_trigger_pin selects the pin
 *
 * @return result of the last logged GPIO call (tkl_gpio_irq_enable)
 */
STATIC OPERATE_RET _ai_input_key_init(AI_DEMO_INPUT_CFG_T *cfg)
{
    OPERATE_RET rt = OPRT_OK;
    KEY_USER_DEF_S key_def;

    // init gpio
    TUYA_GPIO_BASE_CFG_T pin_cfg = {0};
    pin_cfg.direct = TUYA_GPIO_INPUT;
    pin_cfg.mode   = TUYA_GPIO_PULLUP;
    pin_cfg.level  = TUYA_GPIO_LEVEL_HIGH;
    TUYA_CALL_ERR_LOG(tkl_gpio_init(cfg->ai_trigger_pin, &pin_cfg));

    // describe the key
    key_def.port                = cfg->ai_trigger_pin;
    key_def.call_back           = __ai_input_key;
    key_def.low_level_detect    = TRUE;
    key_def.lp_tp               = LP_ONCE_TRIG;
    key_def.long_key_time       = LONG_KEY_TIME;
    key_def.seq_key_detect_time = 200;

    key_init(NULL, 0, 20);
    reg_proc_key(&key_def);
    TUYA_CALL_ERR_LOG(tkl_gpio_irq_enable(cfg->ai_trigger_pin));

    return rt;
}

/**
 * @brief Initialise the demo input: key, microphone and video.
 *
 * Stores a copy of the configuration in the file-scope context, marks the
 * recorder as stopped, then initialises key, microphone and video in that
 * order, returning at the first failure.
 *
 * @param[in] cfg demo input configuration
 *
 * @return OPRT_OK on success, otherwise the error of the failing step
 */
OPERATE_RET ai_demo_input_init(AI_DEMO_INPUT_CFG_T *cfg)
{
    OPERATE_RET rt = OPRT_OK;

    ai_input_ctx.recorder_state = RECORDER_STATUS_STOPPED;
    ai_input_ctx.cfg = *cfg;

    TUYA_CALL_ERR_RETURN(_ai_input_key_init(cfg));
    TUYA_CALL_ERR_RETURN(_ai_input_mic_init(cfg));
    TUYA_CALL_ERR_RETURN(_ai_input_video_init(cfg));

    return rt;
}
/**
 * @brief Event callback: starts the player on AI_EVENT_START and stops it on
 *        AI_EVENT_END; other events are only logged.
 *
 * @param[in] type event type
 *
 * @return always OPRT_OK
 */
OPERATE_RET __ai_agent_event_cb(AI_EVENT_TYPE type)
{
    PR_DEBUG("===event type: %d", type);

    switch(type) {
        case AI_EVENT_START:
            tkl_player_start();
            break;
        case AI_EVENT_END:
            tkl_player_stop();
            break;
        default:
            break;
    }

    return OPRT_OK;
}

/**
 * @brief Media attribute callback: logs the attribute type and does nothing else.
 *
 * @param[in] attr received media attributes
 *
 * @return always OPRT_OK
 */
OPERATE_RET __ai_agent_media_attr_cb(AI_BIZ_ATTR_INFO_T *attr)
{
    PR_DEBUG("===media attr type: %d", attr->type);

    return OPRT_OK;
}

/**
 * @brief Media data callback: writes audio packets to the player stream.
 *
 * Video, image and file packets are accepted but not handled.
 *
 * @param[in] type packet type
 * @param[in] data packet payload
 * @param[in] len  payload length
 *
 * @return result of tkl_player_write_stream for audio, OPRT_OK otherwise
 */
OPERATE_RET __ai_agent_media_data_cb(AI_PACKET_PT type, CHAR_T *data, UINT_T len)
{
    OPERATE_RET rt = OPRT_OK;

    PR_DEBUG("===media data type: %d", type);

    switch(type) {
        case AI_PT_AUDIO:
            rt = tkl_player_write_stream((UINT8_T *)data, len);
            break;
        case AI_PT_VIDEO:
        case AI_PT_IMAGE:
        case AI_PT_FILE:
            // placeholders: nothing is done with these packet types
            break;
        default:
            break;
    }

    return rt;
}

/**
 * @brief Text stream callback: currently ignores the text.
 *
 * Optional logging is left disabled below.
 *
 * @param[in] type text type
 * @param[in] data text data
 * @param[in] len  text length
 *
 * @return always OPRT_OK
 */
OPERATE_RET __ai_agent_text_cb(AI_TEXT_TYPE_E type, CHAR_T *data, INT_T len)
{
    /*
     * Disabled logging example:
     *
     *   CHAR_T *text[] = {"ASR", "NLG", "SKILL"};
     *
     *   if(type <= AI_TEXT_SKILL) {
     *       PR_DEBUG("===text %s: %s", text[type], data);
     *   } else {
     *       PR_DEBUG("===text invalid type: %d", type);
     *   }
     */

    return OPRT_OK;
}

/**
 * @brief Alert callback: plays the local prompt that matches the alert type.
 *
 * Alert types without a mapping below are only logged.
 *
 * @param[in] type alert type
 *
 * @return always OPRT_OK
 */
OPERATE_RET __ai_agent_alert_cb(AI_ALERT_TYPE_E type)
{
    PR_DEBUG("===alert type: %d", type);

    switch(type) {
        case AT_POWER_ON:           ai_demo_player_alert(ALART_TYPE_POWER_ON);           break;
        case AT_NOT_ACTIVE:         ai_demo_player_alert(ALART_TYPE_NOT_ACTIVE);         break;
        case AT_NETWORK_CFG:        ai_demo_player_alert(ALART_TYPE_NETWORK_CFG);        break;
        case AT_NETWORK_CONNECTED:  ai_demo_player_alert(ALART_TYPE_NETWORK_CONNECTED);  break;
        case AT_NETWORK_FAIL:       ai_demo_player_alert(ALART_TYPE_NETWORK_FAIL);       break;
        case AT_NETWORK_DISCONNECT: ai_demo_player_alert(ALART_TYPE_NETWORK_DISCONNECT); break;
        case AT_BATTERY_LOW:        ai_demo_player_alert(ALART_TYPE_BATTERY_LOW);        break;
        case AT_PLEASE_AGAIN:       ai_demo_player_alert(ALART_TYPE_PLEASE_AGAIN);       break;
        default:                    break;
    }

    return OPRT_OK;
}

/**
 * @brief Configure and initialise the AI agent, then the demo input.
 *
 * Fills in the audio, video, image and file attributes, registers the output
 * callbacks, calls tuya_ai_agent_init(), and finally initialises the demo
 * input in hold-to-talk mode. Returns at the first failure.
 *
 * @return OPRT_OK on success, otherwise the error of the failing step
 */
STATIC OPERATE_RET __ai_agent_init(VOID)
{
    OPERATE_RET rt = OPRT_OK;
    AI_AGENT_CFG_T ai_agent_cfg = {0};
    AI_ATTR_BASE_T *attr = &ai_agent_cfg.attr;
    AI_OUTPUT_CBS_T *cbs = &ai_agent_cfg.output;

    // audio
    attr->audio.codec_type  = AUDIO_CODEC_PCM;
    attr->audio.sample_rate = 16000;
    attr->audio.channels    = AUDIO_CHANNELS_MONO;
    attr->audio.bit_depth   = 16;

    // video
    attr->video.codec_type  = VIDEO_CODEC_H264;
    attr->video.sample_rate = 90000;
    attr->video.fps         = 30;
    attr->video.width       = 480;
    attr->video.height      = 480;

    // image
    attr->image.format = IMAGE_FORMAT_JPEG;
    attr->image.width  = 480;
    attr->image.height = 480;

    // file; the name stays NUL-terminated because the config was zeroed above
    attr->file.format = FILE_FORMAT_MP4;
    memcpy(attr->file.file_name, "test.mp4", strlen("test.mp4"));

    // output callbacks
    cbs->event_cb      = __ai_agent_event_cb;
    cbs->media_attr_cb = __ai_agent_media_attr_cb;
    cbs->media_data_cb = __ai_agent_media_data_cb;
    cbs->text_cb       = __ai_agent_text_cb;
    cbs->alert_cb      = __ai_agent_alert_cb;

    TUYA_CALL_ERR_RETURN(tuya_ai_agent_init(&ai_agent_cfg));

    AI_DEMO_INPUT_CFG_T ai_input_cfg = {0};
    ai_input_cfg.ai_trigger_mode    = AI_TRIGGER_MODE_HOLD;
    ai_input_cfg.ai_trigger_pin     = TUYA_GPIO_NUM_12;
    ai_input_cfg.ai_trigger_timeout = 30;
    ai_input_cfg.spk_en_pin         = TUYA_GPIO_NUM_28;
    TUYA_CALL_ERR_RETURN(ai_demo_input_init(&ai_input_cfg));

    return rt;
}

Support and help

If you have any problems with TuyaOS development, you can post your questions in the Tuya Developer Forum.