#include <stdlib.h>
#include <crtdbg.h>
#include <math.h>

#include "VBitmap.h"
#include "Error.h"
#include "mmx.h"

#include "VideoTelecineRemover.h"

// Concrete VideoTelecineRemover that keeps a ring of ten stored frames.
// ProcessIn() writes frames into the ring and scores them; ProcessOut()
// walks the same ring and either copies, reconstructs, or drops each frame.
class CVideoTelecineRemover: public VideoTelecineRemover {
public:
	CVideoTelecineRemover(VBitmap *pInFormat);
	~CVideoTelecineRemover();

	void ProcessIn(VBitmap *pIn, long lFrameNum);
	long ProcessOut(VBitmap *pOut);

private:
	// Storage for ten frames of vb.pitch*vb.h bytes each; slot n starts at
	// pMemBlock + vb.pitch*vb.h*n.
	char *pMemBlock;

	// Per-slot scores written by ProcessIn(): [slot][0] comes from the
	// "field1" sum and [slot][1] from the "field2" sum.
	long nCombVals[10][2];

	// Frame number that was passed to ProcessIn() for each slot.
	long lFrameNums[10];

	// Copy of the input format; its data pointer is re-aimed at ring slots.
	VBitmap vb;

	// Slot the next ProcessIn() call will fill.
	int nCurrentIn;

	// Slot the most recent ProcessOut() call advanced to.
	int nCurrentOut;

	// Offsets currently used by ProcessOut(): a slot equal to nCombOffset1
	// (or +5) is reconstructed, one equal to nCombOffset2 (or +5) is dropped.
	int nCombOffset1;
	int nCombOffset2;

	// Offsets chosen by the latest analysis; they become the active offsets
	// the next time ProcessIn() reaches a five-frame boundary.
	int nNewCombOffset1;
	int nNewCombOffset2;

	// Number of ProcessOut() calls that still return -1 before output starts.
	int nLag;

	// Active flag choosing which of the two adjacent slots feeds each field
	// pass during reconstruction.
	bool fInvertPolarity;

	// Pending value for fInvertPolarity.
	bool fNewPolarity;

	// Active flag; when set, ProcessOut() never takes the reconstruct branch.
	bool fDropMode;

	// Pending value for fDropMode.
	bool fNewDropMode;
};

// Factory for the file-local implementation class.  The returned object is
// allocated with new; the constructor may throw MyMemoryError.
VideoTelecineRemover *CreateVideoTelecineRemover(VBitmap *pInFormat) {
	VideoTelecineRemover *pRemover = new CVideoTelecineRemover(pInFormat);

	return pRemover;
}


// Builds a remover for frames described by *pFormat.
//
// pFormat - format of the frames that will be passed to ProcessIn(); it is
//           copied into vb, and its pitch and h determine the size of the
//           ten-frame store.
//
// Throws MyMemoryError if the frame store allocation yields a null pointer.
CVideoTelecineRemover::CVideoTelecineRemover(VBitmap *pFormat) {
	vb = *pFormat;

	// The null check only matters on compilers whose operator new[] reports
	// failure by returning null rather than by throwing.
	if (!(pMemBlock = new char[pFormat->pitch * pFormat->h * 10]))
		throw MyMemoryError();

	// The first ProcessIn() call compares slot 0 against slot 9, which has
	// never been written.  Clear the store so that comparison reads defined
	// data and the first scores do not depend on heap contents.
	memset(pMemBlock, 0, pFormat->pitch * pFormat->h * 10);

	// nCurrentOut starts at 9 so that the pre-increment in the first
	// ProcessOut() call wraps it to slot 0.
	nCurrentIn = 0;
	nCurrentOut = 9;

	nCombOffset1 = nNewCombOffset1 = -1;
	nCombOffset2 = nNewCombOffset2 = -1;
	fDropMode = fNewDropMode = true;

	// ProcessIn() copies fNewPolarity into fInvertPolarity at its first
	// five-frame boundary, possibly before any analysis has assigned it, so
	// both flags need a defined starting value.
	fInvertPolarity = fNewPolarity = false;

	memset(nCombVals, 0, sizeof nCombVals);
	memset(lFrameNums, 0, sizeof lFrameNums);

	// ProcessOut() returns -1 for its first ten calls.
	nLag = 10;
}

// Out-of-line definition of the base-class destructor; it has nothing to
// release.
VideoTelecineRemover::~VideoTelecineRemover() {
}

// Releases the ten-frame store that the constructor allocated with new[].
CVideoTelecineRemover::~CVideoTelecineRemover() {
	delete[] pMemBlock;
}

// Returns value multiplied by itself.
static inline int sq(int value) {
	const int squared = value * value;

	return squared;
}

// MMX version of the per-scanline score used by ProcessIn().
//
// For each of w four-byte groups it reads
//     A = src1[n], B = src1[n+pitch], C = src1[n+2*pitch], D = src2[n]
// and accumulates  sq(2*B - (A+C)) - sq(2*D - (A+C)).
//
// Unlike the scalar computeScanImprovement(), all four bytes of every group
// take part in the sum, because each load is unpacked to four words and
// pmaddwd sums the squares of all four.
//
// The function is naked: it has no compiler-generated prologue or epilogue
// and fetches its arguments directly relative to esp.
// NOTE(review): the offsets used assume every argument is passed on the stack
// in 32-bit slots (the usual __cdecl layout) - confirm if the calling
// convention or the widths of PixOffset/PixDim ever change.
//
// It does not execute emms; ProcessIn() does that once after its row loop.
// The loop tests the counter only after the first iteration, so w == 0 would
// wrap the counter instead of skipping the loop.
static long __declspec(naked) computeScanImprovementMMX(Pixel8 *src1, Pixel8 *src2, PixOffset pitch, PixDim w) {
	__asm {
		push		ebx				;ebx is used as the loop counter below

		;after the push: [esp+4] is the return address, arguments start at [esp+8]
		mov			eax,[esp+8]		;eax = src1
		mov			ecx,[esp+16]	;ecx = pitch
		mov			edx,[esp+12]	;edx = src2
		mov			ebx,[esp+20]	;ebx = w

		pxor		mm4,mm4			;mm4 = running total, two 32-bit lanes
xloop:
		movd		mm0,[eax]		;mm0 = 4 bytes of pA
		pxor		mm7,mm7			;mm7 = 0, used to widen bytes to words

		movd		mm1,[eax+ecx*2]	;mm1 = 4 bytes of pC
		punpcklbw	mm0,mm7			;mm0 = pA as words

		movd		mm2,[eax+ecx]	;mm2 = 4 bytes of pB
		punpcklbw	mm1,mm7			;mm1 = pC as words

		movd		mm3,[edx]		;mm3 = 4 bytes of pD
		punpcklbw	mm2,mm7			;mm2 = pB

		paddw		mm0,mm1			;mm0 = pA + pC
		paddw		mm2,mm2			;mm2 = 2*pB

		punpcklbw	mm3,mm7			;mm3 = pD
		psubw		mm2,mm0			;mm2 = 2*pB - (pA+pC)

		paddw		mm3,mm3			;mm3 = 2*pD
		pmaddwd		mm2,mm2			;mm2 = sq(2*pB - (pA+pC))

		psubw		mm3,mm0			;mm3 = 2*pD - (pA+pC)
		add			eax,4			;advance src1 by one 4-byte group

		pmaddwd		mm3,mm3			;mm3 = sq(2*pD - (pA+pC))
		add			edx,4			;advance src2 by one 4-byte group

		paddd		mm4,mm2			;total += sq(2*pB - (pA+pC))
		dec			ebx				;sets ZF for the jne below

		;
		;

		psubd		mm4,mm3			;total -= sq(2*pD - (pA+pC)); MMX ops leave ZF alone
		jne			xloop			;branch on the result of dec ebx

		movd		eax,mm4			;eax = low lane of the total
		psrlq		mm4,32
		movd		ecx,mm4			;ecx = high lane of the total
		add			eax,ecx			;return value = low lane + high lane

		pop			ebx
		ret
	}
}

// Scalar per-scanline score used by ProcessIn() when MMX is not enabled.
//
// src1  - first byte of the scanline being examined; the scanlines at
//         +pitch and +2*pitch are read as well.
// src2  - first byte of the comparison scanline in the other frame.
// pitch - byte distance between consecutive scanlines.
// w     - number of four-byte pixels to visit; must be at least 1 because
//         the loop body runs before the counter is tested.
//
// Returns the sum over all visited pixels, for bytes 0..2 of each pixel, of
//     sq(A + C - 2*B) - sq(A + C - 2*D)
// with A = src1[n], B = src1[n+pitch], C = src1[n+2*pitch], D = src2[n].
static long computeScanImprovement(Pixel8 *src1, Pixel8 *src2, PixOffset pitch, PixDim w) {
	// Historical note from the original author, kept because it explains a
	// deliberate oddity: the intent was to measure how much re-interleaving
	// the fields reduces combing, by comparing a line against the
	// interpolation of its neighbours.  D was meant to be src2[pitch+n] but
	// was accidentally written as src2[n], one scanline off.  The accidental
	// form far outperformed the intended one, so it was left alone on
	// purpose.  Do not "correct" the src2 index.

	long total = 0;
	PixDim remaining = w;

	do {
		int delta = 0;

		for (int ch = 0; ch < 3; ++ch) {
			const int outer = src1[ch] + src1[ch + pitch*2];	// A + C
			const int mid   = src1[ch + pitch];					// B
			const int other = src2[ch];							// D

			delta += sq(outer - 2*mid) - sq(outer - 2*other);
		}

		total += delta;

		src1 += 4;
		src2 += 4;
	} while(--remaining);

	return total;
}

// Stores one input frame in the ring, scores it against the previously
// stored frame, and re-evaluates the active pattern every five frames.
//
// pIn       - frame to store; it is blitted into ring slot nCurrentIn.
// lFrameNum - caller's number for this frame; ProcessOut() returns it when
//             the slot is emitted.
//
// Frames with fewer than four lines, or with a non-positive width, are stored
// but not scored; both of their scores are recorded as zero.
void CVideoTelecineRemover::ProcessIn(VBitmap *pIn, long lFrameNum) {
	__int64 field1 = 0, field2 = 0;

	// Copy the frame into slot nCurrentIn and remember its frame number.
	vb.data = (Pixel *)(pMemBlock + vb.pitch*vb.h * nCurrentIn);

	vb.BitBlt(0, 0, pIn, 0, 0, -1, -1);

	lFrameNums[nCurrentIn] = lFrameNum;

	{
		// Each pass covers two scanlines; the helpers also read the two
		// scanlines after the one they are given, hence (h-2)/2 passes.
		PixDim h = (vb.h-2)/2;

		// src1 walks the frame just stored, src2 the slot written before it.
		Pixel8 *src1 = (Pixel8 *)(pMemBlock + vb.pitch*vb.h * nCurrentIn);
		Pixel8 *src2 = (Pixel8 *)(pMemBlock + vb.pitch*vb.h * ((nCurrentIn+9)%10));

		// The row loops and both scan helpers test their counters only after
		// the first iteration, so a zero or negative count would run far past
		// the frame store.  Skip scoring for such degenerate frames.
		if (h > 0 && vb.w > 0) {
			if (MMX_enabled) {
				do {
					field1 += computeScanImprovementMMX(src1, src2, vb.pitch, vb.w);
					field2 += computeScanImprovementMMX(src1+vb.pitch, src2+vb.pitch, vb.pitch, vb.w);

					src1 += vb.pitch*2;
					src2 += vb.pitch*2;
				} while(--h);

				// The MMX helper leaves the FPU in MMX state; restore it
				// before the floating-point sqrt() calls below.
				__asm emms
			} else {
				do {
					field1 += computeScanImprovement(src1, src2, vb.pitch, vb.w);
					field2 += computeScanImprovement(src1+vb.pitch, src2+vb.pitch, vb.pitch, vb.w);

					src1 += vb.pitch*2;
					src2 += vb.pitch*2;
				} while(--h);
			}
		}
	}

	// Negative totals are treated as zero.
	if (field1 < 0)
		field1 = 0;

	if (field2 < 0)
		field2 = 0;

	// The explicit conversion to double selects the double overload of
	// sqrt() instead of relying on an implicit conversion from __int64,
	// which is ambiguous when float and long double overloads are visible.
	const long comb1 = (long)sqrt((double)field1);
	const long comb2 = (long)sqrt((double)field2);

	_RPT2(0,"%16d %16d\n", comb1, comb2);

	nCombVals[nCurrentIn][0] = comb1;
	nCombVals[nCurrentIn][1] = comb2;

	if (++nCurrentIn == 10)
		nCurrentIn = 0;

	// Every five frames: promote the pending decision to the active one and
	// pick a new pending decision from five stored scores.
	if (nCurrentIn == 0 || nCurrentIn == 5) {
		int i;
		long best_score = 0;
		int best_offset = -1;
		bool best_polarity = false;

		// Comparisons are strict, so an all-zero window leaves best_offset
		// at -1, and on ties the earliest candidate wins.
		for(i=0; i<5; i++) {
			long v1 = nCombVals[(nCurrentIn+4+i)%10][0];
			long v2 = nCombVals[(nCurrentIn+4+i)%10][1];

			if (v1 > best_score) {
				best_offset = (i+4)%5;
				best_polarity = true;
				best_score = v1;
			}

			if (v2 > best_score) {
				best_offset = (i+4)%5;
				best_polarity = false;
				best_score = v2;
			}
		}

		_RPT4(0,"----------- %d %d [%d %d]\n", best_offset, best_polarity, nNewCombOffset1, fNewPolarity);

		fDropMode = fNewDropMode;
		nCombOffset1 = nNewCombOffset1;
		nCombOffset2 = nNewCombOffset2;
		fInvertPolarity = fNewPolarity;

		if (best_offset == -1) {
			// No positive score in the window: use drop mode with fixed
			// offsets 0 and 1.
			fNewDropMode = true;
			nNewCombOffset1 = 0;
			nNewCombOffset2 = 1;
		} else {
			fNewDropMode = false;
			nNewCombOffset1 = best_offset;
			nNewCombOffset2 = (best_offset+1) % 5;
			fNewPolarity = best_polarity;
		}
	}
}

// Advances the output position by one ring slot and produces that slot's
// output, if any.
//
// pOut - destination bitmap; written only when a frame is emitted.
//
// Returns the frame number recorded for the slot when a frame was written to
// *pOut, or -1 when nothing was written (start-up lag or a dropped frame).
long CVideoTelecineRemover::ProcessOut(VBitmap *pOut) {
	++nCurrentOut;
	if (nCurrentOut >= 10)
		nCurrentOut = 0;

	// The slot counter still advances while the start-up lag is consumed.
	if (nLag != 0) {
		--nLag;
		return -1;
	}

	// Input frames:	[A1/A2] [A1/B2] [B1/C2] [C1/C2] [D1/D2]
	// Action:			copy    decomb  drop    copy    copy

	const bool fFirstCombed = (nCurrentOut == nCombOffset1 || nCurrentOut == nCombOffset1+5);

	if (!fDropMode && fFirstCombed) {
		// First combed frame: rebuild it from alternate scanlines of this
		// slot and the next one.  fInvertPolarity decides which slot feeds
		// which pass.

		const int nSlotPass1 = (nCurrentOut + (fInvertPolarity ? 1 : 0)) % 10;
		const int nSlotPass2 = (nCurrentOut + (fInvertPolarity ? 0 : 1)) % 10;

		VBitmap vbSrc, vbDst;

		// Pass 1 (the original author's "bottom field"): source and
		// destination are viewed with doubled pitch so that only every other
		// scanline is touched.

		vbSrc			= vb;
		vbSrc.data		= (Pixel *)(pMemBlock + vb.pitch * (vb.h*nSlotPass1 + (vb.h & 1)));
		vbSrc.modulo	+= vb.pitch;
		vbSrc.pitch		*= 2;
		vbSrc.h			= (vbSrc.h + 1)/2;

		vbDst			= *pOut;
		vbDst.modulo	+= vbDst.pitch;
		vbDst.pitch		*= 2;
		vbDst.h			= (vbDst.h + 1)/2;

		vbDst.BitBlt(0, 0, &vbSrc, 0, 0, -1, -1);

		// Pass 2 (the original author's "top field"): the remaining
		// scanlines, taken from the other slot.

		vbSrc.data		= (Pixel *)(pMemBlock + vb.pitch * (vb.h*nSlotPass2 + 1 - (vb.h & 1)));
		vbSrc.h			= vb.h/2;

		vbDst.data		= (Pixel *)((char *)pOut->data + pOut->pitch*(1-(pOut->h & 1)));
		vbDst.h			= pOut->h/2;

		vbDst.BitBlt(0, 0, &vbSrc, 0, 0, -1, -1);

		return lFrameNums[nCurrentOut];
	}

	const bool fSecondCombed = (nCurrentOut == nCombOffset2 || nCurrentOut == nCombOffset2+5);

	if (fSecondCombed) {
		// Second combed frame: drop it.
		return -1;
	}

	// Uncombed, unduplicated frame: copy the slot straight through.

	vb.data = (Pixel *)(pMemBlock + vb.pitch*vb.h * nCurrentOut);

	pOut->BitBlt(0, 0, &vb, 0, 0, -1, -1);

	return lFrameNums[nCurrentOut];
}
