Practice on libx264

H.264 or MPEG4 Part 10 is very popular, and libx264 is the best library.

Before we start encoding sequential images into a short movie, we need the following development tools.

  1. a c/c++ compiler, for example, VS2013 I used. you can get a copy from here: https://www.visualstudio.com/en-us/news/vs2013-community-vs.aspx
  2. yasm,  an assembler to compile codes in assmbly.  you can get a copy from here: http://yasm.tortall.net/Download.html
  3. cmake, a nice build system,  you can get it from here: https://cmake.org/download/
  4. libx264 source code, last but not the least, you need to grab the libx264 source code from here: http://www.videolan.org/developers/x264.html
  5. to save config time, you need 2 config file from here: https://github.com/ShiftMediaProject/x264/tree/master/SMP, get config.h,x264_config.h, save them in x264 source code root folder

Ok, here we go.

# Build libx264 as a static library with MSVC; the hand-written assembly
# files are compiled with yasm through per-file custom commands.
cmake_minimum_required(VERSION 3.1)

project(LIBX264)

# x264 sources include config.h / x264_config.h from the source root.
include_directories("${CMAKE_CURRENT_SOURCE_DIR}")

# Silence MSVC deprecation warnings for the C runtime functions x264 uses.
add_definitions(-D_CRT_SECURE_NO_WARNINGS)

set(C_SOURCES
common/mc.c common/predict.c common/pixel.c common/macroblock.c
common/frame.c common/dct.c common/cpu.c common/cabac.c
common/common.c common/osdep.c common/rectangle.c
common/set.c common/quant.c common/deblock.c common/vlc.c
common/mvpred.c common/bitstream.c
encoder/analyse.c encoder/me.c encoder/ratecontrol.c
encoder/set.c encoder/macroblock.c encoder/cabac.c
encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
common/win32thread.c
common/threadpool.c
common/x86/mc-c.c common/x86/predict-c.c
input/input.c input/raw.c
output/flv.c output/flv_bytestream.c
output/matroska.c output/matroska_ebml.c
output/raw.c
)

# Locate the yasm assembler; fail early with a clear message if missing.
find_program(YASM_EXE NAMES yasm)
if(NOT YASM_EXE)
message(FATAL_ERROR "yasm not found - get it from http://yasm.tortall.net/Download.html")
endif()

# Header-only asm includes (pulled in via -i, never assembled directly):
#x86inc.asm
#x86util.asm

# shared between the 32-bit and 64-bit builds
set(ASM_SOURCES
bitstream-a.asm
cabac-a.asm
const-a.asm
cpu-a.asm
dct-a.asm
deblock-a.asm
mc-a.asm
mc-a2.asm
pixel-a.asm
predict-a.asm
quant-a.asm
sad-a.asm)

# x86 (32-bit) specific
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
list(APPEND ASM_SOURCES
pixel-32.asm
dct-32.asm)
endif()

# x64 specific
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
list(APPEND ASM_SOURCES
dct-64.asm
trellis-64.asm)
endif()

# Assemble each .asm into an .obj with yasm and feed the .obj files to the
# library target. DEPENDS makes the .obj rebuild when its .asm changes.
foreach(asm_file ${ASM_SOURCES})
set(outfile "${CMAKE_CURRENT_BINARY_DIR}/${asm_file}.obj")
set(infile "${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${asm_file}")

if(CMAKE_SIZEOF_VOID_P EQUAL 8)
add_custom_command(OUTPUT ${outfile} COMMAND ${YASM_EXE} ARGS -Xvc -f win64 -d STACK_ALIGNMENT=16 -d HIGH_BIT_DEPTH=0 -d BIT_DEPTH=8 -d WIN32=1 -d ARCH_X86_64=1 -i "${CMAKE_CURRENT_SOURCE_DIR}/common/x86" -o ${outfile} ${infile} DEPENDS ${infile})
else()
add_custom_command(OUTPUT ${outfile} COMMAND ${YASM_EXE} ARGS -Xvc -f win32 -d PREFIX -d STACK_ALIGNMENT=4 -d HIGH_BIT_DEPTH=0 -d BIT_DEPTH=8 -d WIN32=1 -d ARCH_X86_64=0 -i "${CMAKE_CURRENT_SOURCE_DIR}/common/x86" -o ${outfile} ${infile} DEPENDS ${infile})
endif()

list(APPEND C_SOURCES ${outfile})
endforeach()

add_library(libx264 STATIC ${C_SOURCES})

This is the cmakelists.txt, the build file for x264, we will build it as a static library.

let me explain a bit on this build file.

  1. locate yasm, and save its path
  2. collect all the required asm files and build them with yasm; some of them are platform specific, so handle them according to the size of a pointer
  3. collect c files, and add the obj files generated from asm files, build them as a static library.

As a practice, screen capture is a good lesson.

//==================== gdigrab.h  ========================

#pragma once

// Captures a rectangular region of the desktop into a 32-bit top-down
// DIB section via plain GDI. Call Init() once, then Refresh() each time
// a new snapshot is needed; Data() exposes the raw pixel buffer.
class GdiGrab
{
// Top-left corner of the capture region on the desktop.
int x_ = 0, y_ = 0;
// Width/height of the capture region in pixels.
int cx_ = 0, cy_ = 0;
// The DIB section bitmap; owns the pixel memory pointed to by bitmap_.
HBITMAP dib_section_ = nullptr;
// Pixel buffer managed by the DIB section (freed with it) — 4 bytes per
// pixel, B,G,R,unused, rows stored top-down (see Init's negative height).
BYTE* bitmap_ = nullptr;

public:
GdiGrab();
~GdiGrab();

// Creates the capture buffer; returns false on failure.
bool Init(int cx,int cy,int x=0, int y=0);
// Raw pixel data of the last Refresh(); valid until destruction.
BYTE* Data(){ return bitmap_; }
// Buffer size in bytes: width * height * 4 (32 bits per pixel).
uint32_t Size() { return cx_ * cy_ * 4; }
// Copies the current screen contents of the region into the buffer.
void Refresh();
};

//==================== gdigrab.cpp ========================

#include "stdafx.h"
#include "gdigrab.h"

// Nothing to do here: all members have in-class initializers, and the
// actual capture buffer is created lazily in Init().
GdiGrab::GdiGrab()
{
}

// Releases the DIB section (and with it the pixel buffer) if Init()
// ever created one.
GdiGrab::~GdiGrab()
{
    if (nullptr == dib_section_)
        return;
    DeleteObject(dib_section_);
    dib_section_ = nullptr;
}

bool GdiGrab::Init(int cx, int cy, int x, int y)
{
cx_ = cx;
cy_ = cy;
x_ = x;
y_ = y;

BITMAPINFOHEADER bmp = { 0 };
bmp.biSize = sizeof(bmp);
bmp.biWidth = cx_;
bmp.biHeight = -cy_;
bmp.biPlanes = 1;
bmp.biBitCount = 32;
bmp.biCompression = BI_RGB;
dib_section_ = CreateDIBSection(NULL, (BITMAPINFO*)&bmp, DIB_RGB_COLORS, (void**)&bitmap_, nullptr, 0);
return nullptr != dib_section_;
}

void GdiGrab::Refresh()
{
HDC hdc = GetDC(0);
HDC tempdc = CreateCompatibleDC(hdc);
HBITMAP orig_bitmap = (HBITMAP)SelectObject(tempdc, dib_section_);
BitBlt(tempdc, 0, 0, cx_, cy_, hdc, x_, y_, SRCCOPY);
SelectObject(tempdc, orig_bitmap);
DeleteDC(tempdc);
ReleaseDC(0,hdc);
}

We use windows GDI DibSection to store captured image from a selected area of desktop.

One of the tricky things is that Windows stores images from bottom to top, while most other software handles images from top to bottom. So we use a negative height to tell the Windows GDI subsystem to store the image in reversed (top-down) order.

Another one is, to use 32bit DIB section to avoid worrying 32bit alignment.

we can instantiate a gdigrab,  and refresh periodically .we can capture screen like this.

GdiGrab gdigrab;
gdigrab.Init(width, height);

gdigrab.Refresh(); // call this periodically

// gdigrab.Data() is the RGBA data array.

Before we can call x264 to encode the images, we have to pay attention to the colorspace.

The image data in the DIB section is in the RGB color space; the format is RGBA, where A is actually fixed to 255. Because Intel is little-endian, each 4 bytes are (from low to high) Blue, Green, Red, and an unused byte. (read here if you want to know more https://en.wikipedia.org/wiki/RGBA_color_space)

We use the YUV color space and the 420P format when we encode in H.264. That is, we store Y (brightness, luma) for every pixel, and U/V (chrominance) once every 4 pixels. You can get more information about YUV at https://en.wikipedia.org/wiki/YUV. The P in 420P means the three planes are stored separately: Y of all pixels comes first, then U, and last V.

we can use picture management util function x264_picture_alloc to initialize the internal buffer for 420P.

auto p = new x264_picture_t; // new a management object

// Allocates the internal Y/U/V plane buffers; must later be released
// with x264_picture_clean before deleting the management object.
x264_picture_alloc(p, X264_CSP_I420, width, height); // initialize it

to convert an image from RGB32 to YUV420P, we do like this:

// Converts a packed 32-bit BGRX image (as produced by the GDI DIB
// section: bytes are B,G,R,unused per pixel) into planar YUV 4:2:0
// using the integer BT.601 approximation:
//   Y = (( 66R + 129G +  25B + 128) >> 8) + 16
//   U = ((-38R -  74G + 112B + 128) >> 8) + 128
//   V = ((112R -  94G -  18B + 128) >> 8) + 128
// One U and one V sample are emitted per 2x2 pixel block, taken from the
// block's top-left pixel. width and height are assumed even for 4:2:0.
// (Original used en-dashes instead of '-' — would not compile — and mixed
// a signed loop index with unsigned bounds.)
inline void rgb32_to_yuv420p(uint8_t *rgb, uint8_t* y, uint8_t* u, uint8_t* v, uint32_t width, uint32_t height)
{
    for (uint32_t j = 0; j < height; j++)
    {
        for (uint32_t k = 0; k < width; k++)
        {
            // Little-endian pixel layout: byte 0 = B, 1 = G, 2 = R.
            int sR = rgb[2];
            int sG = rgb[1];
            int sB = rgb[0];

            *y = (uint8_t)(((66 * sR + 129 * sG + 25 * sB + 128) >> 8) + 16);

            // Chroma only on even row AND even column (one sample per 2x2).
            if (0 == (j & 1) && 0 == (k & 1))
            {
                *u = (uint8_t)(((-38 * sR - 74 * sG + 112 * sB + 128) >> 8) + 128);
                *v = (uint8_t)(((112 * sR - 94 * sG - 18 * sB + 128) >> 8) + 128);

                ++u;
                ++v;
            }
            ++y;
            rgb += 4; // advance one 4-byte pixel
        }
    }
}

According to the code, we calculate the brightness value of each pixel, and calculate U/V only when both the row and column indices are multiples of 2.

We use integers instead of float for better performance.

Well we have the images in YUV420P now. ok to start encoding in h264.

x264_param_t param;
x264_picture_t pic_out;
std::shared_ptr<x264_t> h;
x264_nal_t *nal;
int i_nal;

// "veryfast" keeps encoding cheap enough for real-time screen capture.
// (Original had smart quotes around the string literals — would not compile.)
x264_param_default_preset(&param, "veryfast", "");
param.i_threads = 1;
param.i_width = width;
param.i_height = height;
param.i_fps_num = 25; // 25 fps
param.i_fps_den = 1;
// One keyframe per second of video.
param.i_keyint_max = 1 * param.i_fps_num / param.i_fps_den;
param.rc.i_bitrate = 1024 * 4; // target bitrate, kbit/s
// Timebase = 1/fps, so pts advances by exactly one tick per frame.
param.i_timebase_den = param.i_fps_num;
param.i_timebase_num = param.i_fps_den;

// .. other code

// forloop

// capture through gdigrab

// encode with x264

 

We can use util functions inside x264 to write output in mkv,flv, etc. however, if we do like this, we will see the generated movie goes really fast when it’s played back.

The reasons are:

  1. gdi capture takes time, it’s about 70~80ms per cycle, however, we need to encode frames every 40ms(25fps)
  2. encoder also takes time.

because we do GDI capture and encoding in the same thread, the total time cost means we cannot generate enough frames each second. As a result, playback appears faster than the physical world because fewer frames were encoded.

To solve this we need a helper class to put the capture logic and encoding logic into 2 threads.

//============= dualbuffersource.h ===============

// Decouples a slow producer (screen capture) from a consumer (encoder)
// by cycling Picture objects between two threads.
//
// Picture must be a nullable, copyable handle — e.g. std::shared_ptr<T>.
//   ready_   : frames captured and waiting for the consumer.
//   backlog_ : free frames available for reuse by the capture thread.
//   in_use_  : the frame most recently handed to the consumer; it is
//              re-delivered when no fresh frame is ready yet.
//
// Fixes vs. the original: smart quotes / en-dashes in the printf calls
// (compile errors), %d used for int64_t (undefined behavior), volatile
// stop flag replaced by std::atomic<bool>, and Windows Sleep(1) replaced
// by std::this_thread::sleep_for for consistency with Get().
template<class Picture>
class DualBufferSource
{
    std::mutex lock_;                // guards ready_, backlog_, in_use_
    std::deque<Picture> ready_;
    std::list<Picture> backlog_;
    Picture in_use_ = nullptr;

    std::unique_ptr<std::thread> thread_;
    std::atomic<bool> stop_{ false };

    int64_t interval_ = 40 * 1000ll; // capture period, microseconds
    int64_t prev_tick_ = GetTickCountUs(), next_tick_ = GetTickCountUs();

public:
    // pfncreate : allocates a brand-new Picture when no reusable one exists.
    // pfnrefresh: fills a Picture with fresh data; returning false sends the
    //             Picture back to the backlog instead of the ready queue.
    // interval  : capture period in microseconds.
    DualBufferSource(std::function<Picture(void)> pfncreate, std::function<bool(Picture)> pfnrefresh, int64_t interval)
    {
        interval_ = interval;
        thread_.reset(new std::thread([pfncreate, pfnrefresh, this]{
            while (!stop_)
            {
                auto tick = GetTickCountUs();
                if (tick >= next_tick_)
                {
                    printf("\t\t\tGDIGrab escaped %lld us\n", (long long)(tick - prev_tick_)); // for debug purpose
                    next_tick_ += interval_;
                    prev_tick_ = tick;

                    lock_.lock();
                    Picture p;
                    if (ready_.size() >= 3)
                    {
                        // Consumer is falling behind: recycle the oldest
                        // ready frame instead of allocating another one.
                        p = ready_.front();
                        ready_.pop_front();
                    }
                    else if (backlog_.size())
                    {
                        p = backlog_.front();
                        if (p != in_use_)
                        {
                            backlog_.pop_front();
                        }
                        else
                        {
                            // The consumer may still be reading this frame;
                            // leave it alone and allocate a fresh one below.
                            p = nullptr;
                        }
                    }
                    lock_.unlock();
                    if (nullptr == p)
                    {
                        p = pfncreate();
                    }
                    if (pfnrefresh(p))
                    {
                        lock_.lock();
                        ready_.push_back(p);
                        lock_.unlock();
                    }
                    else
                    {
                        lock_.lock();
                        backlog_.push_back(p);
                        lock_.unlock();
                    }
                }
                else if (GetTickCountUs() < tick + 1000 && interval_ >= 2000)
                {
                    // More than 1 ms of slack until the next capture: yield
                    // the CPU instead of spinning.
                    auto slept_us = GetTickCountUs();
                    std::this_thread::sleep_for(std::chrono::milliseconds(1));
                    printf("\t\t\t\tslept_us %lld us\n", (long long)(GetTickCountUs() - slept_us));
                }
            }
        }));
    }
    ~DualBufferSource()
    {
        stop_ = true;
        thread_->join();
    }

    // Returns the newest ready frame, skipping stale ones, or re-delivers
    // the last frame when nothing new is ready. need_release tells the
    // caller whether it must hand the frame back via Release(). Blocks
    // until at least one frame has ever been produced.
    Picture Get(bool& need_release)
    {
        while (1)
        {
            {
                std::lock_guard<std::mutex> l(lock_);
                // Drop all but the newest ready frame back into the backlog.
                while (ready_.size() > 1)
                {
                    auto p = ready_.front();
                    ready_.pop_front();
                    backlog_.push_back(p);
                }
                if (ready_.size())
                {
                    auto p = ready_.front();
                    ready_.pop_front();

                    need_release = true;
                    return in_use_ = p;
                }
                else if (in_use_ != nullptr)
                {
                    // No fresh frame yet: re-deliver the previous one, so the
                    // encoder always has something to encode.
                    need_release = false;
                    return in_use_;
                }
            }
            std::this_thread::sleep_for(std::chrono::milliseconds(1));
        }
    }
    // Returns a frame obtained from Get() (need_release == true) to the pool.
    void Release(Picture p)
    {
        std::lock_guard<std::mutex> l(lock_);
        backlog_.push_back(p);
    }
    // Blocks until the capture thread has produced at least one frame.
    void WaitForReady()
    {
        bool need_release = false;
        auto p = Get(need_release);
        if (need_release)
        {
            Release(p);
        }
    }
};

and we can use it like this:

// Capture source: a DualBufferSource producing shared x264 pictures.
auto source = std::make_shared<DualBufferSource<std::shared_ptr<x264_picture_t>>>([&param, &gdigrab]{
// Factory lambda: allocate a picture; the custom deleter frees the plane
// buffers (x264_picture_clean) before deleting the management object.
auto p = std::shared_ptr<x264_picture_t>(new x264_picture_t, [](x264_picture_t* p)
{
x264_picture_clean(p);
delete p;
});
x264_picture_alloc(p.get(), param.i_csp, param.i_width, param.i_height);
return p;
}, [&param,&gdigrab](std::shared_ptr<x264_picture_t> p) -> bool{
// Refresh lambda: grab the desktop and convert BGRX -> YUV420P into the
// picture's three planes. Always succeeds, so frames go to ready_.
gdigrab.Refresh();
rgb32_to_yuv420p(gdigrab.Data(),
p->img.plane[0], p->img.plane[1], p->img.plane[2], param.i_width, param.i_height);
return true;
}, /*interval*/1000/16); // use lower fps for capture logic

It’s using C++ lambda, it contains 2 functions, one for creating a new buffer, one for capturing, the last parameter is used to control the frequency of capture.

Our encode loop becomes this after adding dual buffer.

/* ticks/frame = ticks/second / frames/second */
int ticks_per_frame = (int64_t)param.i_timebase_den * param.i_fps_den / param.i_timebase_num / param.i_fps_num;
ticks_per_frame = X264_MAX(ticks_per_frame, 1);

int64_t largest_pts = -1;
int64_t second_largest_pts = -1;
auto global_start = GetTickCount();
int64_t prev_tick = GetTickCountUs(), next_tick = GetTickCountUs();
/* Encode frames. This loop is paced by wall-clock time: a frame is pulled
   from the dual-buffer source and encoded every `interval` microseconds,
   regardless of how slowly the capture thread runs.
   (Original had smart quotes / en-dashes in the printf calls and printed
   int64_t values with %d.) */
for (int i_frame = 0; i_frame < frames;)
{
    auto tick = GetTickCountUs();
    if (tick >= next_tick)
    {
        printf("Encoder escaped %lld us\n", (long long)(tick - prev_tick)); // for debug purpose
        next_tick += interval;
        prev_tick = tick;

        bool need_release = false;
        auto p = source->Get(need_release);

        // Monotonically increasing pts: exactly one timebase tick per frame.
        p->i_pts = largest_pts + ticks_per_frame;
        int i_frame_size = x264_encoder_encode(h.get(), &nal, &i_nal, p.get(), &pic_out);

        second_largest_pts = largest_pts;
        largest_pts = p->i_pts;

        // Return the frame to the pool only if Get() handed out a fresh one.
        if (need_release)
        {
            source->Release(p);
        }

        if (i_frame_size < 0)
            goto fail;
        else if (i_frame_size)
        {
            if (!FileWriter.write_frame(fout, nal->p_payload, i_frame_size, &pic_out))
                goto fail;
        }

        i_frame++;
    }
    else if (GetTickCountUs() < tick + 1000 && interval >= 2000)
    {
        // More than 1 ms until the next frame is due: yield instead of spinning.
        auto slept_us = GetTickCountUs();
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
        printf("\t\tslept_us %lld us\n", (long long)(GetTickCountUs() - slept_us));
    }
}
printf("total escaped %lu\n", (unsigned long)(GetTickCount() - global_start));
/* Flush delayed frames still buffered inside the encoder (B-frames,
   lookahead) by feeding it NULL pictures until it runs dry. */
while (x264_encoder_delayed_frames(h.get()))
{
    int i_frame_size = x264_encoder_encode(h.get(), &nal, &i_nal, NULL, &pic_out);
    if (i_frame_size < 0)
        goto fail;
    else if (i_frame_size)
    {
        if (!FileWriter.write_frame(fout, nal->p_payload, i_frame_size, &pic_out))
            goto fail;
    }
}

It’s simple to build this test executable file using cmake.

# Test executable: all .c/.cpp files in this directory, linked against the
# static libx264 target. (Original used smart quotes, which break CMake.)
set(TARGET_NAME test_x264)

# For x264.h and the config headers.
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../x264")

file(GLOB_RECURSE SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.c")

add_executable(${TARGET_NAME} ${SOURCES})

target_link_libraries(${TARGET_NAME} libx264)

install(TARGETS ${TARGET_NAME} DESTINATION .)

It includes x264 folder, link itself to libx264, that’s it.

the 32bit release build is  849KB, no dependence.

 

 

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s