"""Audio transcription and text-to-speech demo.

An advanced Python project that combines audio transcription and
text-to-speech synthesis using state-of-the-art tools: Librosa, PyTorch, and
Hugging Face's Transformers library. The script demonstrates how to load and
resample an audio file, transcribe speech to text with Facebook's Wav2Vec2
model, and convert text back to speech with customizable voice options using
pyttsx3.

It is intended for anyone interested in speech processing, AI-driven voice
technology, or natural language processing, and for readers who want to
improve their Python skills through a real-world application of AI in audio
analysis.
"""
import librosa
import pyttsx3
import torch
from scipy.signal import resample
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2Tokenizer
# Location of the recording to transcribe. The value below is a placeholder
# and has to be replaced with the path of a real audio file.
audio_file = "directory of audio file"
# sr=None asks librosa not to resample while loading, so `sr` reports the
# rate stored in the file; the script resamples explicitly further down.
audio, sr = librosa.load(path=audio_file, sr=None)
def resample_audio(audio, orig_sr, target_sr):
    """Resample *audio* from ``orig_sr`` to ``target_sr``.

    The output length is ``floor(len(audio) * target_sr / orig_sr)`` samples
    along the first axis, and the resampling itself is delegated to
    ``scipy.signal.resample``.

    Args:
        audio: Array of samples; its first axis is treated as time.
        orig_sr: Sampling rate of ``audio``. Must be positive.
        target_sr: Desired sampling rate. Must be positive.

    Returns:
        The resampled array. If the computed target length is zero (empty or
        extremely short input), an empty slice of ``audio`` is returned.

    Raises:
        ValueError: If either sampling rate is not positive.
    """
    if orig_sr <= 0 or target_sr <= 0:
        raise ValueError(
            f"sampling rates must be positive, got orig_sr={orig_sr!r}, "
            f"target_sr={target_sr!r}"
        )

    num_samples = audio.shape[0]
    # Multiply before dividing and use floor division: computing
    # int(num_samples / orig_sr * target_sr) rounds the intermediate duration
    # to a float and can land just below the exact integer, losing a sample.
    target_length = int(num_samples * target_sr // orig_sr)

    if target_length == 0:
        # Nothing to synthesize; avoid asking the FFT resampler for 0 samples.
        return audio[:0]

    resampled_audio = resample(audio, target_length)
    return resampled_audio
# Example call of the helper on its own:
#   resampled_audio = resample_audio(audio, 48000, 16000)
#
# Bring the recording to 16 kHz when it was stored at any other rate
# (presumably the rate the speech model used later in this script expects).
if not sr == 16000:
    # The right-hand side is evaluated with the old `sr` before rebinding.
    audio, sr = resample_audio(audio, sr, 16000), 16000
print("Audio loaded and resampled successfully.")
# Load the Wav2Vec2 processor and model.
# Wav2Vec2Processor bundles the audio feature extractor with the CTC
# tokenizer; it replaces Wav2Vec2Tokenizer, which is deprecated for turning
# raw audio into model inputs.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
print("Model loaded successfully.")
# Turn the waveform into model inputs. Passing sampling_rate lets the
# processor reject audio whose rate does not match what the checkpoint
# expects, instead of silently producing a garbage transcription.
input_values = processor(
    audio, sampling_rate=sr, return_tensors="pt"
).input_values
# Perform inference; gradients are not needed for transcription.
with torch.no_grad():
    logits = model(input_values).logits
# Greedy decoding: take the highest-scoring token id for every frame.
predicted_ids = torch.argmax(logits, dim=-1)
# Decode the ids to text (first and only item of the batch).
transcription = processor.batch_decode(predicted_ids)[0]
print("Transcription: ", transcription)
# Text-to-Speech
def text_to_speech(text, voice_gender='female', rate=150):
    """Speak *text* aloud through a pyttsx3 engine and block until finished.

    Args:
        text: The text to speak.
        voice_gender: ``'male'`` selects the first installed voice; any other
            value selects the second one. NOTE: which installed voice is
            actually male or female depends on the platform and is not
            checked here.
        rate: Value assigned to the engine's ``'rate'`` property (speech
            rate; words per minute according to the pyttsx3 docs).
    """
    engine = pyttsx3.init()
    voices = engine.getProperty('voices')

    preferred_index = 0 if voice_gender == 'male' else 1
    if voices:
        # Machines with a single installed voice have no voices[1]; fall back
        # to the last available voice instead of raising IndexError.
        chosen = voices[min(preferred_index, len(voices) - 1)]
        engine.setProperty('voice', chosen.id)
    # With no voices reported at all, the engine's default voice is kept.

    engine.setProperty('rate', rate)
    engine.say(text)
    engine.runAndWait()
# Demo: speak the same sentence twice, first with the male preset at rate
# 150, then with the female preset at rate 180.
long_text = "hi what happened"
for gender, speed in (('male', 150), ('female', 180)):
    text_to_speech(long_text, voice_gender=gender, rate=speed)
print("Text-to-Speech conversion completed.")
#AudioTranscription
#TextToSpeech
#Wav2Vec2
#PyTorch
#Librosa
#NLP
#SpeechRecognition
#VoiceSynthesis
#AIinPython
#NaturalLanguageProcessing
#SpeechToText
#Pyttsx3
#MachineLearning
#DeepLearning
#AudioProcessing
#PythonAI
#TransformersLibrary
#PythonCoding
#PythonTutorial
# Comments