#pragma once
#include"../modint/simd-montgomery.hpp"constexprintSZ_FFT_BUF=1<<23;uint32_tbuf1_[SZ_FFT_BUF]__attribute__((aligned(64)));uint32_tbuf2_[SZ_FFT_BUF]__attribute__((aligned(64)));template<typenamemint>structNTT{staticconstexpruint32_tget_pr(){uint32_tmod=mint::get_mod();usingu64=uint64_t;u64ds[32]={};intidx=0;u64m=mod-1;for(u64i=2;i*i<=m;++i){if(m%i==0){ds[idx++]=i;while(m%i==0)m/=i;}}if(m!=1)ds[idx++]=m;uint32_tpr=2;while(1){intflg=1;for(inti=0;i<idx;++i){u64a=pr,b=(mod-1)/ds[i],r=1;while(b){if(b&1)r=r*a%mod;a=a*a%mod;b>>=1;}if(r==1){flg=0;break;}}if(flg==1)break;++pr;}returnpr;};staticconstexpruint32_tmod=mint::get_mod();staticconstexpruint32_tpr=get_pr();staticconstexprintlevel=__builtin_ctzll(mod-1);mintdw[level],dy[level];mint*buf1,*buf2;NTT(){setwy(level);buf1=reinterpret_cast<mint*>(::buf1_);buf2=reinterpret_cast<mint*>(::buf2_);}constexprvoidsetwy(intk){mintw[level],y[level];w[k-1]=mint(pr).pow((mod-1)/(1<<k));y[k-1]=w[k-1].inverse();for(inti=k-2;i>0;--i)w[i]=w[i+1]*w[i+1],y[i]=y[i+1]*y[i+1];dw[1]=w[1],dy[1]=y[1],dw[2]=w[2],dy[2]=y[2];for(inti=3;i<k;++i){dw[i]=dw[i-1]*y[i-2]*w[i];dy[i]=dy[i-1]*w[i-2]*y[i];}}__attribute__((target("sse4.2")))voidntt(mint*a,intn){intk=n?__builtin_ctz(n):0;if(k==0)return;if(k==1){minta1=a[1];a[1]=a[0]-a[1];a[0]=a[0]+a1;return;}if(k&1){intv=1<<(k-1);for(intj=0;j<v;++j){mintajv=a[j+v];a[j+v]=a[j]-ajv;a[j]+=ajv;}}intu=1<<(2+(k&1));intv=1<<(k-2-(k&1));mintone=mint(1);mintimag=dw[1];const__m128im0=_mm_set1_epi32(0);const__m128im1=_mm_set1_epi32(mod);const__m128im2=_mm_set1_epi32(mod+mod);const__m128ir=_mm_set1_epi32(mint::r);const__m128iImag=_mm_set1_epi32(imag.a);while(v){if(v==1){mintww=one,xx=one,wx=one;for(intjh=0;jh<u;){ww=xx*xx,wx=ww*xx;mintt0=a[jh+0],t1=a[jh+1]*xx;mintt2=a[jh+2]*ww,t3=a[jh+3]*wx;mintt0p2=t0+t2,t1p3=t1+t3;mintt0m2=t0-t2,t1m3=(t1-t3)*imag;a[jh+0]=t0p2+t1p3,a[jh+1]=t0p2-t1p3;a[jh+2]=t0m2+t1m3,a[jh+3]=t0m2-t1m3;xx*=dw[__builtin_ctz((jh+=4))];}}else{mintww=one,xx=one,wx=one;for(intjh=0;jh<u;){if(jh==0){intj0=0;intj1=v;intj2=j1+v;intj3=j2+v;intje=v;for(;j0<je;j0+=4,j1+=4,j2+=4,j3+=4){__m128iT0=_mm_loadu_si128((__m128i*)(a+j0));__m128iT1=_mm_loadu_si128((__m128i*)(a+j1));__m128iT2=_mm_loadu_si128((__m128i*)(a+j2));__m128iT3=_mm_loadu_si128((__m128i*)(a+j3));__m128iT0P2=montgomery_add_128(T0,T2,m2,m0);__m128iT1P3=montgomery_add_128(T1,T3,m2,m0);__m128iT0M2=montgomery_sub_128(T0,T2,m2,m0);__m128iT1M3=montgomery_mul_128(montgomery_sub_128(T1,T3,m2,m0),Imag,r,m1);_mm_storeu_si128((__m128i*)(a+j0),montgomery_add_128(T0P2,T1P3,m2,m0));_mm_storeu_si128((__m128i*)(a+j1),montgomery_sub_128(T0P2,T1P3,m2,m0));_mm_storeu_si128((__m128i*)(a+j2),montgomery_add_128(T0M2,T1M3,m2,m0));_mm_storeu_si128((__m128i*)(a+j3),montgomery_sub_128(T0M2,T1M3,m2,m0));}}else{ww=xx*xx,wx=ww*xx;__m128iWW=_mm_set1_epi32(ww.a);__m128iWX=_mm_set1_epi32(wx.a);__m128iXX=_mm_set1_epi32(xx.a);intj0=jh*v;intj1=j0+v;intj2=j1+v;intj3=j2+v;intje=j1;for(;j0<je;j0+=4,j1+=4,j2+=4,j3+=4){__m128iT0=_mm_loadu_si128((__m128i*)(a+j0));__m128iT1=_mm_loadu_si128((__m128i*)(a+j1));__m128iT2=_mm_loadu_si128((__m128i*)(a+j2));__m128iT3=_mm_loadu_si128((__m128i*)(a+j3));T1=montgomery_mul_128(T1,XX,r,m1);T2=montgomery_mul_128(T2,WW,r,m1);T3=montgomery_mul_128(T3,WX,r,m1);__m128iT0P2=montgomery_add_128(T0,T2,m2,m0);__m128iT1P3=montgomery_add_128(T1,T3,m2,m0);__m128iT0M2=montgomery_sub_128(T0,T2,m2,m0);__m128iT1M3=montgomery_mul_128(montgomery_sub_128(T1,T3,m2,m0),Imag,r,m1);_mm_storeu_si128((__m128i*)(a+j0),montgomery_add_128(T0P2,T1P3,m2,m0));_mm_storeu_si128((__m128i*)(a+j1),montgomery_sub_128(T0P2,T1P3,m2,m0));_mm_storeu_si128((__m128i*)(a+j2),montgomery_add_128(T0M2,T1M3,m2,m0));_mm_storeu_si128((__m128i*)(a+j3),montgomery_sub_128(T0M2,T1M3,m2,m0));}}xx*=dw[__builtin_ctz((jh+=4))];}}u<<=2;v>>=2;}}__attribute__((target("sse4.2")))voidintt(mint*a,intn,intnormalize=true){intk=n?__builtin_ctz(n):0;if(k==0)return;if(k==1){minta1=a[1];a[1]=a[0]-a[1];a[0]=a[0]+a1;return;}intu=1<<(k-2);intv=1;mintone=mint(1);mintimag=dy[1];const__m128im0=_mm_set1_epi32(0);const__m128im1=_mm_set1_epi32(mod);const__m128im2=_mm_set1_epi32(mod+mod);const__m128ir=_mm_set1_epi32(mint::r);const__m128iImag=_mm_set1_epi32(imag.a);while(u){if(v==1){mintww=one,xx=one,yy=one;u<<=2;for(intjh=0;jh<u;){ww=xx*xx,yy=xx*imag;mintt0=a[jh+0],t1=a[jh+1];mintt2=a[jh+2],t3=a[jh+3];mintt0p1=t0+t1,t2p3=t2+t3;mintt0m1=(t0-t1)*xx,t2m3=(t2-t3)*yy;a[jh+0]=t0p1+t2p3,a[jh+2]=(t0p1-t2p3)*ww;a[jh+1]=t0m1+t2m3,a[jh+3]=(t0m1-t2m3)*ww;xx*=dy[__builtin_ctz(jh+=4)];}}else{mintww=one,xx=one,yy=one;u<<=2;for(intjh=0;jh<u;){if(jh==0){intj0=0;intj1=v;intj2=v+v;intj3=j2+v;for(;j0<v;j0+=4,j1+=4,j2+=4,j3+=4){__m128iT0=_mm_loadu_si128((__m128i*)(a+j0));__m128iT1=_mm_loadu_si128((__m128i*)(a+j1));__m128iT2=_mm_loadu_si128((__m128i*)(a+j2));__m128iT3=_mm_loadu_si128((__m128i*)(a+j3));__m128iT0P1=montgomery_add_128(T0,T1,m2,m0);__m128iT2P3=montgomery_add_128(T2,T3,m2,m0);__m128iT0M1=montgomery_sub_128(T0,T1,m2,m0);__m128iT2M3=montgomery_mul_128(montgomery_sub_128(T2,T3,m2,m0),Imag,r,m1);_mm_storeu_si128((__m128i*)(a+j0),montgomery_add_128(T0P1,T2P3,m2,m0));_mm_storeu_si128((__m128i*)(a+j2),montgomery_sub_128(T0P1,T2P3,m2,m0));_mm_storeu_si128((__m128i*)(a+j1),montgomery_add_128(T0M1,T2M3,m2,m0));_mm_storeu_si128((__m128i*)(a+j3),montgomery_sub_128(T0M1,T2M3,m2,m0));}}else{ww=xx*xx,yy=xx*imag;__m128iWW=_mm_set1_epi32(ww.a);__m128iXX=_mm_set1_epi32(xx.a);__m128iYY=_mm_set1_epi32(yy.a);intj0=jh*v;intj1=j0+v;intj2=j1+v;intj3=j2+v;intje=j1;for(;j0<je;j0+=4,j1+=4,j2+=4,j3+=4){__m128iT0=_mm_loadu_si128((__m128i*)(a+j0));__m128iT1=_mm_loadu_si128((__m128i*)(a+j1));__m128iT2=_mm_loadu_si128((__m128i*)(a+j2));__m128iT3=_mm_loadu_si128((__m128i*)(a+j3));__m128iT0P1=montgomery_add_128(T0,T1,m2,m0);__m128iT2P3=montgomery_add_128(T2,T3,m2,m0);__m128iT0M1=montgomery_mul_128(montgomery_sub_128(T0,T1,m2,m0),XX,r,m1);__m128iT2M3=montgomery_mul_128(montgomery_sub_128(T2,T3,m2,m0),YY,r,m1);_mm_storeu_si128((__m128i*)(a+j0),montgomery_add_128(T0P1,T2P3,m2,m0));_mm_storeu_si128((__m128i*)(a+j2),montgomery_mul_128(montgomery_sub_128(T0P1,T2P3,m2,m0),WW,r,m1));_mm_storeu_si128((__m128i*)(a+j1),montgomery_add_128(T0M1,T2M3,m2,m0));_mm_storeu_si128((__m128i*)(a+j3),montgomery_mul_128(montgomery_sub_128(T0M1,T2M3,m2,m0),WW,r,m1));}}xx*=dy[__builtin_ctz(jh+=4)];}}u>>=4;v<<=2;}if(k&1){u=1<<(k-1);for(intj=0;j<u;++j){mintajv=a[j]-a[j+u];a[j]+=a[j+u];a[j+u]=ajv;}}if(normalize){mintinvn=one/mint(n);for(inti=0;i<n;i++)a[i]*=invn;}}constexprvector<mint>multiply(constvector<mint>&a,constvector<mint>&b){intl=a.size()+b.size()-1;if(min<int>(a.size(),b.size())<=40){vector<mint>s(l);for(inti=0;i<(int)a.size();++i)for(intj=0;j<(int)b.size();++j)s[i+j]+=a[i]*b[j];returns;}intM=4;while(M<l)M<<=1;for(inti=0;i<(int)a.size();++i)buf1[i].a=a[i].a;for(inti=(int)a.size();i<M;++i)buf1[i].a=0;for(inti=0;i<(int)b.size();++i)buf2[i].a=b[i].a;for(inti=(int)b.size();i<M;++i)buf2[i].a=0;ntt(buf1,M);ntt(buf2,M);for(inti=0;i<M;++i)buf1[i].a=mint::reduce(uint64_t(buf1[i].a)*buf2[i].a);intt(buf1,M,false);vector<mint>s(l);mintinvm=mint(M).inverse();for(inti=0;i<l;++i)s[i]=buf1[i]*invm;returns;}};