Hello there!
Let me explain: my ULTIMATE goal, on iOS and Windows, is for my Unity app to capture voices from 2 different audio input devices, send the audio via WebRTC, receive other audio over WebRTC, and play it on 2 audio output devices in real time.
I thought of a solution for the inputs part by doing this:
- In the Unity Update() method, get raw data from inputs using lock/unlock
- Merge the 2 raw data byte arrays into one
- Convert this byte array to a float array, with its values normalized between -1 and 1
- Send this float array to an AudioStreamTrack (Unity webRTC lib)
- Send this AudioStreamTrack to my webRTC peer
And for the outputs part:
- Get an AudioStreamTrack from my webRTC peer
- Get the data of the AudioStreamTrack (float array)
- Convert float array to byte array
- At runtime in Update(), play the raw data in 2 different outputs
Currently, I’m working on the inputs part. I’m able to record my voice at runtime and get the raw data. The problem appears when I try to listen to this raw data: I can hear my voice, but only as 1 second of sound followed by 1 second of silence, and this pattern repeats in a loop. There is also a slight echo during the 1 second with sound.
I don’t know what part I’m doing wrong, or if the ultimate goal is even possible to achieve by doing what I imagined. My current code was done using lots of forum resources on other topics but I’m not an expert on audio development.
Get microphone’s sound initialization:
// Describe the user-created sound that receives the microphone data: mono, 16-bit PCM.
exinfo.cbsize = Marshal.SizeOf(typeof(FMOD.CREATESOUNDEXINFO));
exinfo.numchannels = 1;
exinfo.format = FMOD.SOUND_FORMAT.PCM16;
exinfo.defaultfrequency = sampleRate; // For my mic -> 48000
// Buffer size in bytes: sampleRate mono 16-bit samples, i.e. one second of audio.
exinfo.length = (uint)sampleRate * sizeof(short);
var mode = FMOD.MODE.LOOP_NORMAL | FMOD.MODE.OPENUSER;
// Every FMOD call returns a result code; pass each one to ERRCHECK so a failed
// creation or a failed recording start is logged instead of silently ignored.
ERRCHECK(RuntimeManager.CoreSystem.createSound(exinfo.userdata, mode, ref exinfo, out sound));
ERRCHECK(RuntimeManager.CoreSystem.recordStart(InputDeviceIndex, sound, true));
// Length is queried in PCM samples (not bytes).
ERRCHECK(sound.getLength(out soundLength, FMOD.TIMEUNIT.PCM));
Play sound initialization:
// Describe the user-created sound used for playback: mono, 16-bit PCM.
exinfo2.cbsize = Marshal.SizeOf(typeof(FMOD.CREATESOUNDEXINFO));
exinfo2.numchannels = 1;
exinfo2.format = FMOD.SOUND_FORMAT.PCM16;
// Use the same rate as the buffer length below; a hard-coded 48000 here would
// disagree with the length whenever sampleRate is not 48000.
exinfo2.defaultfrequency = sampleRate;
// Buffer size in bytes: sampleRate mono 16-bit samples, i.e. one second of audio.
exinfo2.length = (uint)sampleRate * sizeof(short);
ERRCHECK(RuntimeManager.CoreSystem.createSound(exinfo2.userdata, FMOD.MODE.LOOP_NORMAL | FMOD.MODE.OPENUSER | FMOD.MODE.OPENRAW, ref exinfo2, out recvSound));
Get raw data from my microphone:
void Update()
{
    time += Time.deltaTime;
    if (time >= updateTimeOut)
    {
        time = 0.0f;

        // The record position is reported in PCM samples, like soundLength
        // (which was queried with TIMEUNIT.PCM).
        ERRCHECK(RuntimeManager.CoreSystem.getRecordPosition(InputDeviceIndex, out uint recordPos));
        int blocklength = (int)recordPos - lastRecordPos;
        if (blocklength < 0)
        {
            // The record cursor wrapped around the looping buffer.
            blocklength += (int)soundLength;
        }

        float[] soundDataFloat;
        byte[] sounDataTmp;

        // Sound.lock takes its offset and length in BYTES, while the positions above
        // are in samples. The sound is mono PCM16, so one sample is sizeof(short) bytes.
        const int bytesPerSample = sizeof(short);
        uint lockOffset = (uint)lastRecordPos * bytesPerSample;
        uint lockLength = (uint)blocklength * bytesPerSample;

        // Lock -> get the mic raw data
        ERRCHECK(sound.@lock(lockOffset, lockLength, out IntPtr ptr1, out IntPtr ptr2, out uint len1, out uint len2));

        // Always allocate (possibly empty) so sounDataTmp is definitely assigned
        // even when the lock returned no data.
        byte[] soundData = new byte[len1 + len2];
        if (len1 > 0)
        {
            Marshal.Copy(ptr1, soundData, 0, (int)len1);
            samplePos += (int)len1;
            if (len2 > 0)
            {
                // The wrapped part goes right after the first len1 bytes
                // (an offset of len1 - 1 would overwrite the last byte of the first part).
                Marshal.Copy(ptr2, soundData, (int)len1, (int)len2);
                samplePos += (int)len2;
            }
        }
        sounDataTmp = soundData;

        ERRCHECK(sound.unlock(ptr1, ptr2, len1, len2));
        // NOTE(review): lastRecordPos is not advanced anywhere in the code shown;
        // it presumably must be set to recordPos once this block has been consumed - confirm.
Try to listen to the raw data (I have it in the same Update method):
// Lock -> play sound
// Sound.lock expects its offset in bytes. lastRecordPos is a position in PCM samples,
// so scale it by the sample size (mono PCM16 = sizeof(short) bytes per sample).
// samplePos already counts bytes (it accumulates the locked byte lengths).
ERRCHECK(recvSound.@lock((uint)lastRecordPos * sizeof(short), (uint)samplePos, out IntPtr recvPtr1, out IntPtr recvPtr2, out uint recvLen1, out uint recvLen2));
if (recvLen1 > 0)
{
    Marshal.Copy(sounDataTmp, 0, recvPtr1, (int)recvLen1);
    samplePos -= (int)recvLen1;
    if (recvLen2 > 0)
    {
        // The wrapped part continues right after the first recvLen1 source bytes
        // (starting at recvLen1 - 1 would replay one byte and shift every sample).
        Marshal.Copy(sounDataTmp, (int)recvLen1, recvPtr2, (int)recvLen2);
        samplePos -= (int)recvLen2;
    }
}
else
{
    soundDataFloat = null;
}
nextPlaybackPos = playbackPos + len1 + len2;
channel.setPaused(false);
// Unlock with the pointers returned by recvSound's own lock, not the microphone's.
ERRCHECK(recvSound.unlock(recvPtr1, recvPtr2, recvLen1, recvLen2));
It may be unclear but the ERRCHECK method is just a Debug.Log with the result:
/// <summary>
/// Logs the FMOD result code and its error description when the call did not succeed.
/// Does nothing for <c>FMOD.RESULT.OK</c>.
/// </summary>
/// <param name="result">Result code returned by an FMOD call.</param>
static void ERRCHECK(FMOD.RESULT result)
{
    if (result == FMOD.RESULT.OK)
    {
        return;
    }

    Debug.Log("FMOD_Unity: FmodEventImportPostProcessor: " + result + " - " + FMOD.Error.String(result));
}
I also prepared some methods that I cannot currently test, but they will be useful if I pass the first step.
Methods to convert my byte array to a float array:
// 16 bits -> format = FMOD.SOUND_FORMAT.PCM16
private readonly float min = -32768f;
private readonly float max = 32767;

/// <summary>
/// Converts raw 16-bit PCM bytes (host byte order, as read by <see cref="BitConverter"/>)
/// into floats normalized to the range [-1, 1].
/// </summary>
/// <param name="audio">Raw PCM16 data, two bytes per sample.</param>
/// <returns>One float per complete 16-bit sample; an incomplete trailing byte is ignored.</returns>
/// <exception cref="ArgumentNullException"><paramref name="audio"/> is null.</exception>
public float[] Convert(byte[] audio)
{
    if (audio == null)
    {
        throw new ArgumentNullException(nameof(audio));
    }

    // Integer division drops an incomplete trailing byte; reading it with
    // BitConverter.ToInt16 would throw on odd-length input.
    int sampleCount = audio.Length / sizeof(short);

    // The result size is known up front, so fill an array directly instead of growing a list.
    float[] converted = new float[sampleCount];
    for (int i = 0; i < sampleCount; i++)
    {
        short value = BitConverter.ToInt16(audio, i * sizeof(short));
        converted[i] = normalize(value);
    }
    return converted;
}

// normalize between -1 and 1: min maps to -1 and max maps to +1.
// Note: because the range [-32768, 32767] is asymmetric, a sample value of 0 maps to
// roughly 1.5e-5 rather than exactly 0.
float normalize(float input)
{
    return 2f * ((input - min) / (max - min)) - 1f;
}
And a method to merge 2 raw data arrays into 1:
/// <summary>
/// Mixes two raw 16-bit PCM buffers (host byte order) sample by sample into a new buffer
/// as long as the longer input.
/// </summary>
/// <remarks>
/// Where both samples are non-zero the result is their average (truncated toward zero);
/// otherwise the non-zero sample (or 0) is used. Samples past the end of the shorter
/// buffer are treated as 0, so the longer buffer's tail is copied through unchanged.
/// An incomplete trailing byte is left as 0 in the result.
/// </remarks>
/// <param name="audio1">First PCM16 buffer.</param>
/// <param name="audio2">Second PCM16 buffer.</param>
/// <returns>The mixed PCM16 buffer.</returns>
/// <exception cref="ArgumentNullException">Either buffer is null.</exception>
public byte[] Aggregation(byte[] audio1, byte[] audio2)
{
    if (audio1 == null)
    {
        throw new ArgumentNullException(nameof(audio1));
    }
    if (audio2 == null)
    {
        throw new ArgumentNullException(nameof(audio2));
    }

    byte[] fusion = new byte[Math.Max(audio1.Length, audio2.Length)];

    int sampleCount = fusion.Length / sizeof(short);
    // Complete samples available in each input; never read beyond these counts.
    int sampleCount1 = audio1.Length / sizeof(short);
    int sampleCount2 = audio2.Length / sizeof(short);

    short[] mixed = new short[sampleCount];
    for (int i = 0; i < sampleCount; i++)
    {
        int byteOffset = i * sizeof(short);
        short s1 = i < sampleCount1 ? BitConverter.ToInt16(audio1, byteOffset) : (short)0;
        short s2 = i < sampleCount2 ? BitConverter.ToInt16(audio2, byteOffset) : (short)0;

        if (s1 != 0 && s2 != 0)
        {
            // The operands are promoted to int, so the sum cannot overflow;
            // integer division truncates toward zero.
            mixed[i] = (short)((s1 + s2) / 2);
        }
        else
        {
            mixed[i] = s1 != 0 ? s1 : s2;
        }
    }

    // Copy the samples back as raw bytes in host byte order, matching BitConverter above.
    Buffer.BlockCopy(mixed, 0, fusion, 0, sampleCount * sizeof(short));
    return fusion;
}
I hope my explanations were clear enough, otherwise please don’t hesitate to ask for more info. Thanks everyone!