转自:
前面分享过一个算法《》
主要用于评估一定长度音频的音量强度,
而分析之后,很多类似的需求,自然就是要做音频增益、提高音量之类的处理。
不过在项目实测的时候,其实真的很难定标准,
到底在什么样的环境下,要增大音量,还是降低。
在通讯行业一般的做法就是采用静音检测,
一旦检测为静音或者噪音,则不做处理,反之通过一定的策略进行处理。
这里就涉及到两个算法,一个是静音检测,一个是音频增益。
增益其实没什么好说的,类似于数据归一化拉伸的做法。
静音检测 在WebRTC中 是采用计算GMM (Gaussian Mixture Model,高斯混合模型)进行特征提取的。
在很长一段时间里面,音频特征 有3个主要的方法,
GMM ,Spectrogram (声谱图), MFCC 即 Mel-Frequency Cepstrum(Mel频率倒谱)
恕我直言,GMM 提取的特征,其鲁棒性 不如后两者。
也不多做介绍,感兴趣的同学,翻翻 维基百科 ,补补课。
当然在实际使用算法时,会由此延伸出来一些小技巧。
例如,用静音检测 来做音频裁剪,或者搭配音频增益做一些音频增强之类的操作。
自动增益在WebRTC 源代码文件是:analog_agc.c 和 digital_agc.c
静音检测 源代码文件是: webrtc_vad.c
这个命名,有一定的历史原因了。
经过梳理后,
增益算法为 agc.c agc.h
静音检测为 vad.c vad.h
增益算法的完整示例代码:
#include#include #include //采用https://github.com/mackron/dr_libs/blob/master/dr_wav.h 解码#define DR_WAV_IMPLEMENTATION#include "dr_wav.h"#include "agc.h"#ifndef nullptr#define nullptr 0#endif#ifndef MIN#define MIN(A, B) ((A) < (B) ? (A) : (B))#endif//写wav文件void wavWrite_int16(char *filename, int16_t *buffer, size_t sampleRate, size_t totalSampleCount) { drwav_data_format format = {}; format.container = drwav_container_riff; // <-- drwav_container_riff = normal WAV files, drwav_container_w64 = Sony Wave64. format.format = DR_WAVE_FORMAT_PCM; // <-- Any of the DR_WAVE_FORMAT_* codes. format.channels = 1; format.sampleRate = (drwav_uint32) sampleRate; format.bitsPerSample = 16; drwav *pWav = drwav_open_file_write(filename, &format); if (pWav) { drwav_uint64 samplesWritten = drwav_write(pWav, totalSampleCount, buffer); drwav_uninit(pWav); if (samplesWritten != totalSampleCount) { fprintf(stderr, "ERROR\n"); exit(1); } }}//读取wav文件int16_t *wavRead_int16(char *filename, uint32_t *sampleRate, uint64_t *totalSampleCount) { unsigned int channels; int16_t *buffer = drwav_open_and_read_file_s16(filename, &channels, sampleRate, totalSampleCount); if (buffer == nullptr) { printf("读取wav文件失败."); } //仅仅处理单通道音频 if (channels != 1) { drwav_free(buffer); buffer = nullptr; *sampleRate = 0; *totalSampleCount = 0; } return buffer;}//分割路径函数void splitpath(const char *path, char *drv, char *dir, char *name, char *ext) { const char *end; const char *p; const char *s; if (path[0] && path[1] == ':') { if (drv) { *drv++ = *path++; *drv++ = *path++; *drv = '\0'; } } else if (drv) *drv = '\0'; for (end = path; *end && *end != ':';) end++; for (p = end; p > path && *--p != '\\' && *p != '/';) if (*p == '.') { end = p; break; } if (ext) for (s = end; (*ext = *s++);) ext++; for (p = end; p > path;) if (*--p == '\\' || *p == '/') { p++; break; } if (name) { for (s = p; s < end;) *name++ = *s++; *name = '\0'; } if (dir) { for (s = path; s < p;) *dir++ = *s++; *dir = '\0'; }}int agcProcess(int16_t 
*buffer, uint32_t sampleRate, size_t samplesCount, int16_t agcMode) { if (buffer == nullptr) return -1; if (samplesCount == 0) return -1; WebRtcAgcConfig agcConfig; agcConfig.compressionGaindB = 9; // default 9 dB agcConfig.limiterEnable = 1; // default kAgcTrue (on) agcConfig.targetLevelDbfs = 3; // default 3 (-3 dBOv) int minLevel = 0; int maxLevel = 255; size_t samples = MIN(160, sampleRate / 100); if (samples == 0) return -1; const int maxSamples = 320; int16_t *input = buffer; size_t nTotal = (samplesCount / samples); void *agcInst = WebRtcAgc_Create(); if (agcInst == NULL) return -1; int status = WebRtcAgc_Init(agcInst, minLevel, maxLevel, agcMode, sampleRate); if (status != 0) { printf("WebRtcAgc_Init fail\n"); WebRtcAgc_Free(agcInst); return -1; } status = WebRtcAgc_set_config(agcInst, agcConfig); if (status != 0) { printf("WebRtcAgc_set_config fail\n"); WebRtcAgc_Free(agcInst); return -1; } size_t num_bands = 1; int inMicLevel, outMicLevel = -1; int16_t out_buffer[maxSamples]; int16_t *out16 = out_buffer; uint8_t saturationWarning = 1; //是否有溢出发生,增益放大以后的最大值超过了65536 int16_t echo = 0; //增益放大是否考虑回声影响 for (int i = 0; i < nTotal; i++) { inMicLevel = 0; int nAgcRet = WebRtcAgc_Process(agcInst, (const int16_t *const *) &input, num_bands, samples, (int16_t *const *) &out16, inMicLevel, &outMicLevel, echo, &saturationWarning); if (nAgcRet != 0) { printf("failed in WebRtcAgc_Process\n"); WebRtcAgc_Free(agcInst); return -1; } memcpy(input, out_buffer, samples * sizeof(int16_t)); input += samples; } WebRtcAgc_Free(agcInst); return 1;}void auto_gain(char *in_file, char *out_file) { //音频采样率 uint32_t sampleRate = 0; //总音频采样数 uint64_t inSampleCount = 0; int16_t *inBuffer = wavRead_int16(in_file, &sampleRate, &inSampleCount); //如果加载成功 if (inBuffer != nullptr) { // kAgcModeAdaptiveAnalog 模拟音量调节 // kAgcModeAdaptiveDigital 自适应增益 // kAgcModeFixedDigital 固定增益 agcProcess(inBuffer, sampleRate, inSampleCount, kAgcModeAdaptiveDigital); wavWrite_int16(out_file, inBuffer, sampleRate, 
inSampleCount); free(inBuffer); }}int main(int argc, char *argv[]) { printf("WebRTC Automatic Gain Control\n"); printf("博客:http://cpuimage.cnblogs.com/\n"); printf("音频自动增益\n"); if (argc < 2) return -1; char *in_file = argv[1]; char drive[3]; char dir[256]; char fname[256]; char ext[256]; char out_file[1024]; splitpath(in_file, drive, dir, fname, ext); sprintf(out_file, "%s%s%s_out%s", drive, dir, fname, ext); auto_gain(in_file, out_file); printf("按任意键退出程序 \n"); getchar(); return 0;}
静音检测完整示例代码:
#include#include #include //采用https://github.com/mackron/dr_libs/blob/master/dr_wav.h 解码#define DR_WAV_IMPLEMENTATION#include "dr_wav.h"#include "vad.h"#ifndef nullptr#define nullptr 0#endif#ifndef MIN#define MIN(A, B) ((A) < (B) ? (A) : (B))#endif#ifndef MAX#define MAX(A, B) ((A) > (B) ? (A) : (B))#endif//读取wav文件int16_t *wavRead_int16(char *filename, uint32_t *sampleRate, uint64_t *totalSampleCount) { unsigned int channels; int16_t *buffer = drwav_open_and_read_file_s16(filename, &channels, sampleRate, totalSampleCount); if (buffer == nullptr) { printf("读取wav文件失败."); } //仅仅处理单通道音频 if (channels != 1) { drwav_free(buffer); buffer = nullptr; *sampleRate = 0; *totalSampleCount = 0; } return buffer;}int vadProcess(int16_t *buffer, uint32_t sampleRate, size_t samplesCount, int16_t vad_mode, int per_ms_frames) { if (buffer == nullptr) return -1; if (samplesCount == 0) return -1; // kValidRates : 8000, 16000, 32000, 48000 // 10, 20 or 30 ms frames per_ms_frames = MAX(MIN(30, per_ms_frames), 10); size_t samples = sampleRate * per_ms_frames / 1000; if (samples == 0) return -1; int16_t *input = buffer; size_t nTotal = (samplesCount / samples); void *vadInst = WebRtcVad_Create(); if (vadInst == NULL) return -1; int status = WebRtcVad_Init(vadInst); if (status != 0) { printf("WebRtcVad_Init fail\n"); WebRtcVad_Free(vadInst); return -1; } status = WebRtcVad_set_mode(vadInst, vad_mode); if (status != 0) { printf("WebRtcVad_set_mode fail\n"); WebRtcVad_Free(vadInst); return -1; } printf("Activity : \n"); for (int i = 0; i < nTotal; i++) { int nVadRet = WebRtcVad_Process(vadInst, sampleRate, input, samples); if (nVadRet == -1) { printf("failed in WebRtcVad_Process\n"); WebRtcVad_Free(vadInst); return -1; } else { // output result printf(" %d \t", nVadRet); } input += samples; } printf("\n"); WebRtcVad_Free(vadInst); return 1;}void vad(char *in_file) { //音频采样率 uint32_t sampleRate = 0; //总音频采样数 uint64_t inSampleCount = 0; int16_t *inBuffer = wavRead_int16(in_file, &sampleRate, 
&inSampleCount); //如果加载成功 if (inBuffer != nullptr) { // Aggressiveness mode (0, 1, 2, or 3) int16_t mode = 1; int per_ms = 30; vadProcess(inBuffer, sampleRate, inSampleCount, mode, per_ms); free(inBuffer); }}int main(int argc, char *argv[]) { printf("WebRTC Voice Activity Detector\n"); printf("博客:http://cpuimage.cnblogs.com/\n"); printf("静音检测\n"); if (argc < 2) return -1; char *in_file = argv[1]; vad(in_file); printf("按任意键退出程序 \n"); getchar(); return 0;}
自动增益项目地址:
具体流程为:
加载wav(拖放wav文件到可执行文件上)->增益处理->保存为_out.wav文件
静音检测项目地址:
具体流程为:
加载wav(拖放wav文件到可执行文件上)->输出静音检测结果
备注 :1 为非静音,0 为静音
该注意的地方和参数,见代码注释。
用cmake即可编译示例代码,详情见CMakeLists.txt。
若有其他相关问题或者需求也可以邮件联系俺探讨。
邮箱地址是:
gaozhihan@vip.qq.com