Python Project: Audio Transcription and Text-to-Speech Conversion Using Wav2Vec2 and Pyttsx3

Explore an advanced Python project that combines audio transcription and text-to-speech synthesis using state-of-the-art tools such as Librosa, SciPy, PyTorch, Hugging Face's Transformers library, and pyttsx3. This script demonstrates how to load and resample audio files, transcribe speech to text using Facebook's Wav2Vec2 model, and convert text back to speech with customizable voice options using pyttsx3. It is well suited to anyone interested in speech processing, AI-driven voice technology, or natural language processing projects, and it is a good way to enhance your Python skills while diving into real-world applications of AI in audio analysis.

import librosa

from scipy.signal import resample

import torch

from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

import pyttsx3

# Location of the recording to transcribe (placeholder value: point it at a real file).
audio_file = "directory of audio file"

# Read the waveform together with its sample rate; sr=None asks librosa to
# keep the file's own rate instead of resampling while loading.
audio, sr = librosa.load(path=audio_file, sr=None)

def resample_audio(audio, orig_sr, target_sr):
    """Resample *audio* from ``orig_sr`` to ``target_sr``.

    The output length is the input length scaled by ``target_sr / orig_sr``
    and rounded down; the conversion itself is done by
    ``scipy.signal.resample``, which works along the first axis.

    Args:
        audio: Array of samples; ``audio.shape[0]`` is taken as the number of
            samples. NOTE(review): assumes a mono, time-first array -- confirm
            before passing multi-channel data.
        orig_sr: Sample rate of *audio*.
        target_sr: Sample rate the result should have.

    Returns:
        The resampled array, ``int(audio.shape[0] * target_sr / orig_sr)``
        samples long.
    """
    # Multiply before dividing. Going through the duration first
    # (n / orig_sr) adds a rounding step, and int() can then truncate a value
    # such as 28.999999999999996 to one sample fewer than intended.
    target_length = int(audio.shape[0] * target_sr / orig_sr)

    return resample(audio, target_length)

# Example call of resample_audio:
#     resampled_audio = resample_audio(audio, 48000, 16000)

# Bring the recording to 16000 Hz unless it is already at that rate; the
# sample-rate variable is updated in the same step so the two stay in sync.
if sr != 16000:
    audio, sr = resample_audio(audio, sr, 16000), 16000

print("Audio loaded and resampled successfully.")

# Fetch the pretrained tokenizer and the CTC model from the same checkpoint.
# NOTE(review): newer transformers releases are reported to deprecate
# Wav2Vec2Tokenizer in favour of Wav2Vec2Processor -- confirm against the
# installed version before upgrading.
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
print("Model loaded successfully.")

# Encode the waveform as PyTorch tensors and keep the model input.
input_values = tokenizer(audio, return_tensors="pt").input_values

# Forward pass with gradient tracking switched off, then pick the
# highest-scoring id along the last axis of the logits.
with torch.no_grad():
    logits = model(input_values).logits
    predicted_ids = logits.argmax(dim=-1)

# Turn the ids back into text and keep the first entry of the decoded batch.
transcription = tokenizer.batch_decode(predicted_ids)[0]
print("Transcription: ", transcription)

# Text-to-Speech

def text_to_speech(text, voice_gender='female', rate=150):
    """Speak *text* aloud through a pyttsx3 engine.

    Args:
        text: The text to be spoken.
        voice_gender: ``'male'`` selects the first installed voice; any other
            value (default ``'female'``) selects the second one.
            NOTE(review): which installed voice is male or female depends on
            the platform's speech driver -- confirm the ordering on the
            target machine.
        rate: Value assigned to the engine's ``rate`` property.

    Returns:
        None. The call blocks until the queued utterance has been processed.
    """
    engine = pyttsx3.init()
    voices = engine.getProperty('voices')

    # Exactly one voice is chosen per call, so a 'male' request is not
    # overridden by the other voice afterwards.
    voice_index = 0 if voice_gender == 'male' else 1

    # Keep the engine's default voice when the requested slot does not exist
    # (e.g. only one voice installed) instead of failing with IndexError.
    if voice_index < len(voices):
        engine.setProperty('voice', voices[voice_index].id)

    engine.setProperty('rate', rate)

    # Queue the utterance and run the engine until it has been spoken;
    # without these two calls the function configures the engine but stays
    # silent.
    engine.say(text)
    engine.runAndWait()



# Demo: speak one sample sentence twice, first requesting the male voice at
# rate 150, then the female voice at rate 180.
long_text = "hi what happened"

for _gender, _rate in (("male", 150), ("female", 180)):
    text_to_speech(long_text, voice_gender=_gender, rate=_rate)

print("Text-to-Speech conversion completed.")



