#define PROBLEM "https://judge.yosupo.jp/problem/convolution_mod"
#include"../../template/template.hpp"//#include"../../modint/montgomery-modint.hpp"
#include"../../ntt/ntt-avx2.hpp"constexprintMOD=998244353;usingmint=LazyMontgomeryModInt<MOD>;usingvm=vector<mint>;usingnamespaceNyaan;voidNyaan::solve(){NTT<mint>ntt;ini(N,M);vma(N),b(M);in(a,b);autoc=ntt.multiply(a,b);out(c);}
#line 1 "verify/verify-yosupo-ntt/yosupo-convolution-ntt-avx2.test.cpp"
#define PROBLEM "https://judge.yosupo.jp/problem/convolution_mod"
#line 2 "template/template.hpp"
usingnamespacestd;// intrinstic#include<immintrin.h>#include<algorithm>
#include<array>
#include<bitset>
#include<cassert>
#include<cctype>
#include<cfenv>
#include<cfloat>
#include<chrono>
#include<cinttypes>
#include<climits>
#include<cmath>
#include<complex>
#include<cstdarg>
#include<cstddef>
#include<cstdint>
#include<cstdio>
#include<cstdlib>
#include<cstring>
#include<deque>
#include<fstream>
#include<functional>
#include<initializer_list>
#include<iomanip>
#include<ios>
#include<iostream>
#include<istream>
#include<iterator>
#include<limits>
#include<list>
#include<map>
#include<memory>
#include<new>
#include<numeric>
#include<ostream>
#include<queue>
#include<random>
#include<set>
#include<sstream>
#include<stack>
#include<streambuf>
#include<string>
#include<tuple>
#include<type_traits>
#include<typeinfo>
#include<unordered_map>
#include<unordered_set>
#include<utility>
#include<vector>// utility#line 3 "template/util.hpp"
namespaceNyaan{usingll=longlong;usingi64=longlong;usingu64=unsignedlonglong;usingi128=__int128_t;usingu128=__uint128_t;template<typenameT>usingV=vector<T>;template<typenameT>usingVV=vector<vector<T>>;usingvi=vector<int>;usingvl=vector<longlong>;usingvd=V<double>;usingvs=V<string>;usingvvi=vector<vector<int>>;usingvvl=vector<vector<longlong>>;template<typenameT>usingminpq=priority_queue<T,vector<T>,greater<T>>;template<typenameT,typenameU>structP:pair<T,U>{template<typename...Args>P(Args...args):pair<T,U>(args...){}usingpair<T,U>::first;usingpair<T,U>::second;P&operator+=(constP&r){first+=r.first;second+=r.second;return*this;}P&operator-=(constP&r){first-=r.first;second-=r.second;return*this;}P&operator*=(constP&r){first*=r.first;second*=r.second;return*this;}template<typenameS>P&operator*=(constS&r){first*=r,second*=r;return*this;}Poperator+(constP&r)const{returnP(*this)+=r;}Poperator-(constP&r)const{returnP(*this)-=r;}Poperator*(constP&r)const{returnP(*this)*=r;}template<typenameS>Poperator*(constS&r)const{returnP(*this)*=r;}Poperator-()const{returnP{-first,-second};}};usingpl=P<ll,ll>;usingpi=P<int,int>;usingvp=V<pl>;constexprintinf=1001001001;constexprlonglonginfLL=4004004004004004004LL;template<typenameT>intsz(constT&t){returnt.size();}template<typenameT,typenameU>inlineboolamin(T&x,Uy){return(y<x)?(x=y,true):false;}template<typenameT,typenameU>inlineboolamax(T&x,Uy){return(x<y)?(x=y,true):false;}template<typenameT>inlineTMax(constvector<T>&v){return*max_element(begin(v),end(v));}template<typenameT>inlineTMin(constvector<T>&v){return*min_element(begin(v),end(v));}template<typenameT>inlinelonglongSum(constvector<T>&v){returnaccumulate(begin(v),end(v),0LL);}template<typenameT>intlb(constvector<T>&v,constT&a){returnlower_bound(begin(v),end(v),a)-begin(v);}template<typenameT>intub(constvector<T>&v,constT&a){returnupper_bound(begin(v),end(v),a)-begin(v);}constexprlonglongTEN(intn){longlongret=1,x=10;for(;n;x*=x,n>>=1)ret*=(n&1?x:1);returnret;}template<typenameT,typenameU>pair<T,U>mkp(constT&t,constU&u){returnmake_pair(t,u);}template<typenameT>vector<T>mkrui(constvector<T>&v,boolrev=false){vector<T>ret(v.size()+1);if(rev){for(inti=int(v.size())-1;i>=0;i--)ret[i]=v[i]+ret[i+1];}else{for(inti=0;i<int(v.size());i++)ret[i+1]=ret[i]+v[i];}returnret;};template<typenameT>vector<T>mkuni(constvector<T>&v){vector<T>ret(v);sort(ret.begin(),ret.end());ret.erase(unique(ret.begin(),ret.end()),ret.end());returnret;}template<typenameF>vector<int>mkord(intN,Ff){vector<int>ord(N);iota(begin(ord),end(ord),0);sort(begin(ord),end(ord),f);returnord;}template<typenameT>vector<int>mkinv(vector<T>&v){intmax_val=*max_element(begin(v),end(v));vector<int>inv(max_val+1,-1);for(inti=0;i<(int)v.size();i++)inv[v[i]]=i;returninv;}vector<int>mkiota(intn){vector<int>ret(n);iota(begin(ret),end(ret),0);returnret;}template<typenameT>Tmkrev(constT&v){Tw{v};reverse(begin(w),end(w));returnw;}template<typenameT>boolnxp(T&v){returnnext_permutation(begin(v),end(v));}// 返り値の型は入力の T に依存// i 要素目 : [0, a[i])template<typenameT>vector<vector<T>>product(constvector<T>&a){vector<vector<T>>ret;vector<T>v;autodfs=[&](autorc,inti)->void{if(i==(int)a.size()){ret.push_back(v);return;}for(intj=0;j<a[i];j++)v.push_back(j),rc(rc,i+1),v.pop_back();};dfs(dfs,0);returnret;}// F : void(T&), mod を取る操作// T : 整数型のときはオーバーフローに注意するtemplate<typenameT,typenameF>TPower(Ta,longlongn,constT&I,F&&f){static_assert(std::is_invocable_r_v<void,F&,T&>,"Power callback must be callable as void(T&)");Tres=I;for(;n;std::invoke(f,a=a*a),n>>=1){if(n&1)std::invoke(f,res=res*a);}returnres;}// T : 整数型のときはオーバーフローに注意するtemplate<typenameT>TPower(Ta,longlongn,constT&I=T{1}){autono_op=[](T&)->void{};returnPower(a,n,I,no_op);}template<typenameT>TRev(constT&v){Tres=v;reverse(begin(res),end(res));returnres;}template<typenameT>vector<T>Transpose(constvector<T>&v){usingU=typenameT::value_type;if(v.empty())return{};intH=v.size(),W=v[0].size();vectorres(W,T(H,U{}));for(inti=0;i<H;i++){for(intj=0;j<W;j++){res[j][i]=v[i][j];}}returnres;}template<typenameT>vector<T>Rotate(constvector<T>&v,intclockwise=true){usingU=typenameT::value_type;intH=v.size(),W=v[0].size();vectorres(W,T(H,U{}));for(inti=0;i<H;i++){for(intj=0;j<W;j++){if(clockwise){res[W-1-j][i]=v[i][j];}else{res[j][H-1-i]=v[i][j];}}}returnres;}}// namespace Nyaan#line 58 "template/template.hpp"
// bit operation#line 1 "template/bitop.hpp"
namespaceNyaan{__attribute__((target("popcnt")))inlineintpopcnt(constu64&a){return__builtin_popcountll(a);}inlineintlsb(constu64&a){returna?__builtin_ctzll(a):64;}inlineintctz(constu64&a){returna?__builtin_ctzll(a):64;}inlineintmsb(constu64&a){returna?63-__builtin_clzll(a):-1;}template<typenameT>inlineintgbit(constT&a,inti){return(a>>i)&1;}template<typenameT>inlinevoidsbit(T&a,inti,boolb){if(gbit(a,i)!=b)a^=T(1)<<i;}constexprlonglongPW(intn){return1LL<<n;}constexprlonglongMSK(intn){return(1LL<<n)-1;}}// namespace Nyaan#line 61 "template/template.hpp"
// inout#line 1 "template/inout.hpp"
namespaceNyaan{template<typenameT,typenameU>ostream&operator<<(ostream&os,constpair<T,U>&p){os<<p.first<<" "<<p.second;returnos;}template<typenameT,typenameU>istream&operator>>(istream&is,pair<T,U>&p){is>>p.first>>p.second;returnis;}template<typenameT>ostream&operator<<(ostream&os,constvector<T>&v){ints=(int)v.size();for(inti=0;i<s;i++)os<<(i?" ":"")<<v[i];returnos;}template<typenameT>istream&operator>>(istream&is,vector<T>&v){for(auto&x:v)is>>x;returnis;}istream&operator>>(istream&is,__int128_t&x){stringS;is>>S;x=0;intflag=0;for(auto&c:S){if(c=='-'){flag=true;continue;}x*=10;x+=c-'0';}if(flag)x=-x;returnis;}istream&operator>>(istream&is,__uint128_t&x){stringS;is>>S;x=0;for(auto&c:S){x*=10;x+=c-'0';}returnis;}ostream&operator<<(ostream&os,__int128_tx){if(x==0)returnos<<0;if(x<0)os<<'-',x=-x;stringS;while(x)S.push_back('0'+x%10),x/=10;reverse(begin(S),end(S));returnos<<S;}ostream&operator<<(ostream&os,__uint128_tx){if(x==0)returnos<<0;stringS;while(x)S.push_back('0'+x%10),x/=10;reverse(begin(S),end(S));returnos<<S;}voidin(){}template<typenameT,class...U>voidin(T&t,U&...u){cin>>t;in(u...);}voidout(){cout<<"\n";}template<typenameT,class...U,charsep=' '>voidout(constT&t,constU&...u){cout<<t;if(sizeof...(u))cout<<sep;out(u...);}structIoSetupNya{IoSetupNya(){cin.tie(nullptr);ios::sync_with_stdio(false);cout<<fixed<<setprecision(15);cerr<<fixed<<setprecision(7);}}iosetupnya;}// namespace Nyaan#line 64 "template/template.hpp"
// debug#line 1 "template/debug.hpp"
namespaceDebugImpl{template<typenameU,typename=void>structis_specialize:false_type{};template<typenameU>structis_specialize<U,typenameconditional<false,typenameU::iterator,void>::type>:true_type{};template<typenameU>structis_specialize<U,typenameconditional<false,decltype(U::first),void>::type>:true_type{};template<typenameU>structis_specialize<U,enable_if_t<is_integral<U>::value,void>>:true_type{};voiddump(constchar&t){cerr<<t;}voiddump(conststring&t){cerr<<t;}voiddump(constbool&t){cerr<<(t?"true":"false");}voiddump(__int128_tt){if(t==0)cerr<<0;if(t<0)cerr<<'-',t=-t;stringS;while(t)S.push_back('0'+t%10),t/=10;reverse(begin(S),end(S));cerr<<S;}voiddump(__uint128_tt){if(t==0)cerr<<0;stringS;while(t)S.push_back('0'+t%10),t/=10;reverse(begin(S),end(S));cerr<<S;}template<typenameU,enable_if_t<!is_specialize<U>::value,nullptr_t>=nullptr>voiddump(constU&t){cerr<<t;}template<typenameT>voiddump(constT&t,enable_if_t<is_integral<T>::value>*=nullptr){stringres;if(t==Nyaan::inf)res="inf";ifconstexpr(is_signed<T>::value){if(t==-Nyaan::inf)res="-inf";}ifconstexpr(sizeof(T)==8){if(t==Nyaan::infLL)res="inf";ifconstexpr(is_signed<T>::value){if(t==-Nyaan::infLL)res="-inf";}}if(res.empty())res=to_string(t);cerr<<res;}template<typenameT,typenameU>voiddump(constpair<T,U>&);template<typenameT>voiddump(constpair<T*,int>&);template<typenameT>voiddump(constT&t,enable_if_t<!is_void<typenameT::iterator>::value>*=nullptr){cerr<<"[ ";for(autoit=t.begin();it!=t.end();){dump(*it);cerr<<(++it==t.end()?"":", ");}cerr<<" ]";}template<typenameT,typenameU>voiddump(constpair<T,U>&t){cerr<<"( ";dump(t.first);cerr<<", ";dump(t.second);cerr<<" )";}template<typenameT>voiddump(constpair<T*,int>&t){cerr<<"[ ";for(inti=0;i<t.second;i++){dump(t.first[i]);cerr<<(i==t.second-1?"":", ");}cerr<<" ]";}voidtrace(){cerr<<endl;}template<typenameHead,typename...Tail>voidtrace(Head&&head,Tail&&...tail){cerr<<" ";dump(head);if(sizeof...(tail)!=0)cerr<<",";trace(std::forward<Tail>(tail)...);}}// namespace DebugImpl#ifdef NyaanDebug
#define trc(...) \
do { \
cerr << "## " << #__VA_ARGS__ << " = "; \
DebugImpl::trace(__VA_ARGS__); \
} while (0)
#else
#define trc(...) (void(0))
#endif
#ifdef NyaanLocal
#define trc2(...) \
do { \
cerr << "## " << #__VA_ARGS__ << " = "; \
DebugImpl::trace(__VA_ARGS__); \
} while (0)
#else
#define trc2(...) (void(0))
#endif
#line 67 "template/template.hpp"
// macro#line 1 "template/macro.hpp"
#define each(x, v) for (auto&& x : v)
#define each2(x, y, v) for (auto&& [x, y] : v)
#define all(v) (v).begin(), (v).end()
#define rep(i, N) for (long long i = 0; i < (long long)(N); i++)
#define repr(i, N) for (long long i = (long long)(N)-1; i >= 0; i--)
#define rep1(i, N) for (long long i = 1; i <= (long long)(N); i++)
#define repr1(i, N) for (long long i = (N); (long long)(i) > 0; i--)
#define reg(i, a, b) for (long long i = (a); i < (b); i++)
#define regr(i, a, b) for (long long i = (b)-1; i >= (a); i--)
#define fi first
#define se second
#define ini(...) \
int __VA_ARGS__; \
in(__VA_ARGS__)
#define inl(...) \
long long __VA_ARGS__; \
in(__VA_ARGS__)
#define ins(...) \
string __VA_ARGS__; \
in(__VA_ARGS__)
#define in2(s, t) \
for (int i = 0; i < (int)s.size(); i++) { \
in(s[i], t[i]); \
}
#define in3(s, t, u) \
for (int i = 0; i < (int)s.size(); i++) { \
in(s[i], t[i], u[i]); \
}
#define in4(s, t, u, v) \
for (int i = 0; i < (int)s.size(); i++) { \
in(s[i], t[i], u[i], v[i]); \
}
#define die(...) \
do { \
Nyaan::out(__VA_ARGS__); \
return; \
} while (0)
#line 70 "template/template.hpp"
namespaceNyaan{voidsolve();}intmain(){Nyaan::solve();}#line 4 "verify/verify-yosupo-ntt/yosupo-convolution-ntt-avx2.test.cpp"
//#line 2 "modint/montgomery-modint.hpp"
#line 5 "modint/montgomery-modint.hpp"
template<uint32_tmod>structLazyMontgomeryModInt{usingmint=LazyMontgomeryModInt;usingi32=int32_t;usingu32=uint32_t;usingu64=uint64_t;staticconstexpru32get_r(){u32ret=mod;for(i32i=0;i<4;++i)ret*=2-mod*ret;returnret;}staticconstexpru32r=get_r();staticconstexpru32n2=-u64(mod)%mod;static_assert(mod<(1<<30),"invalid, mod >= 2 ^ 30");static_assert((mod&1)==1,"invalid, mod % 2 == 0");static_assert(r*mod==1,"this code has bugs.");u32a;constexprLazyMontgomeryModInt():a(0){}constexprLazyMontgomeryModInt(constint64_t&b):a(reduce(u64(b%mod+mod)*n2)){};staticconstexpru32reduce(constu64&b){return(b+u64(u32(b)*u32(-r))*mod)>>32;}constexprmint&operator+=(constmint&b){if(i32(a+=b.a-2*mod)<0)a+=2*mod;return*this;}constexprmint&operator-=(constmint&b){if(i32(a-=b.a)<0)a+=2*mod;return*this;}constexprmint&operator*=(constmint&b){a=reduce(u64(a)*b.a);return*this;}constexprmint&operator/=(constmint&b){*this*=b.inverse();return*this;}constexprmintoperator+(constmint&b)const{returnmint(*this)+=b;}constexprmintoperator-(constmint&b)const{returnmint(*this)-=b;}constexprmintoperator*(constmint&b)const{returnmint(*this)*=b;}constexprmintoperator/(constmint&b)const{returnmint(*this)/=b;}constexprbooloperator==(constmint&b)const{return(a>=mod?a-mod:a)==(b.a>=mod?b.a-mod:b.a);}constexprbooloperator!=(constmint&b)const{return(a>=mod?a-mod:a)!=(b.a>=mod?b.a-mod:b.a);}constexprmintoperator-()const{returnmint()-mint(*this);}constexprmintoperator+()const{returnmint(*this);}constexprmintpow(u64n)const{mintret(1),mul(*this);while(n>0){if(n&1)ret*=mul;mul*=mul;n>>=1;}returnret;}constexprmintinverse()const{intx=get(),y=mod,u=1,v=0,t=0,tmp=0;while(y>0){t=x/y;x-=t*y,u-=t*v;tmp=x,x=y,y=tmp;tmp=u,u=v,v=tmp;}returnmint{u};}friendstd::ostream&operator<<(std::ostream&os,constmint&b){returnos<<b.get();}friendstd::istream&operator>>(std::istream&is,mint&b){int64_tt;is>>t;b=LazyMontgomeryModInt<mod>(t);return(is);}constexpru32get()const{u32ret=reduce(a);returnret>=mod?ret-mod:ret;}staticconstexpru32get_mod(){returnmod;}};#line 2 "ntt/ntt-avx2.hpp"
#line 2 "modint/simd-montgomery.hpp"
#line 4 "modint/simd-montgomery.hpp"
__attribute__((target("sse4.2")))inline__m128imy128_mullo_epu32(const__m128i&a,const__m128i&b){return_mm_mullo_epi32(a,b);}__attribute__((target("sse4.2")))inline__m128imy128_mulhi_epu32(const__m128i&a,const__m128i&b){__m128ia13=_mm_shuffle_epi32(a,0xF5);__m128ib13=_mm_shuffle_epi32(b,0xF5);__m128iprod02=_mm_mul_epu32(a,b);__m128iprod13=_mm_mul_epu32(a13,b13);__m128iprod=_mm_unpackhi_epi64(_mm_unpacklo_epi32(prod02,prod13),_mm_unpackhi_epi32(prod02,prod13));returnprod;}__attribute__((target("sse4.2")))inline__m128imontgomery_mul_128(const__m128i&a,const__m128i&b,const__m128i&r,const__m128i&m1){return_mm_sub_epi32(_mm_add_epi32(my128_mulhi_epu32(a,b),m1),my128_mulhi_epu32(my128_mullo_epu32(my128_mullo_epu32(a,b),r),m1));}__attribute__((target("sse4.2")))inline__m128imontgomery_add_128(const__m128i&a,const__m128i&b,const__m128i&m2,const__m128i&m0){__m128iret=_mm_sub_epi32(_mm_add_epi32(a,b),m2);return_mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0,ret),m2),ret);}__attribute__((target("sse4.2")))inline__m128imontgomery_sub_128(const__m128i&a,const__m128i&b,const__m128i&m2,const__m128i&m0){__m128iret=_mm_sub_epi32(a,b);return_mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0,ret),m2),ret);}__attribute__((target("avx2")))inline__m256imy256_mullo_epu32(const__m256i&a,const__m256i&b){return_mm256_mullo_epi32(a,b);}__attribute__((target("avx2")))inline__m256imy256_mulhi_epu32(const__m256i&a,const__m256i&b){__m256ia13=_mm256_shuffle_epi32(a,0xF5);__m256ib13=_mm256_shuffle_epi32(b,0xF5);__m256iprod02=_mm256_mul_epu32(a,b);__m256iprod13=_mm256_mul_epu32(a13,b13);__m256iprod=_mm256_unpackhi_epi64(_mm256_unpacklo_epi32(prod02,prod13),_mm256_unpackhi_epi32(prod02,prod13));returnprod;}__attribute__((target("avx2")))inline__m256imontgomery_mul_256(const__m256i&a,const__m256i&b,const__m256i&r,const__m256i&m1){return_mm256_sub_epi32(_mm256_add_epi32(my256_mulhi_epu32(a,b),m1),my256_mulhi_epu32(my256_mullo_epu32(my256_mullo_epu32(a,b),r),m1));}__attribute__((target("avx2")))inline__m256imontgomery_add_256(const__m256i&a,const__m256i&b,const__m256i&m2,const__m256i&m0){__m256iret=_mm256_sub_epi32(_mm256_add_epi32(a,b),m2);return_mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0,ret),m2),ret);}__attribute__((target("avx2")))inline__m256imontgomery_sub_256(const__m256i&a,const__m256i&b,const__m256i&m2,const__m256i&m0){__m256iret=_mm256_sub_epi32(a,b);return_mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0,ret),m2),ret);}#line 4 "ntt/ntt-avx2.hpp"
namespacentt_inner{usingu64=uint64_t;constexpruint32_tget_pr(uint32_tmod){if(mod==2)return1;u64ds[32]={};intidx=0;u64m=mod-1;for(u64i=2;i*i<=m;++i){if(m%i==0){ds[idx++]=i;while(m%i==0)m/=i;}}if(m!=1)ds[idx++]=m;uint32_tpr=2;while(1){intflg=1;for(inti=0;i<idx;++i){u64a=pr,b=(mod-1)/ds[i],r=1;while(b){if(b&1)r=r*a%mod;a=a*a%mod;b>>=1;}if(r==1){flg=0;break;}}if(flg==1)break;++pr;}returnpr;}constexprintSZ_FFT_BUF=1<<23;uint32_t_buf1[SZ_FFT_BUF]__attribute__((aligned(64)));uint32_t_buf2[SZ_FFT_BUF]__attribute__((aligned(64)));}// namespace ntt_innertemplate<typenamemint>structNTT{staticconstexpruint32_tmod=mint::get_mod();staticconstexpruint32_tpr=ntt_inner::get_pr(mint::get_mod());staticconstexprintlevel=__builtin_ctzll(mod-1);mintdw[level],dy[level];mint*buf1,*buf2;constexprNTT(){setwy(level);unionraw_cast{mintdat;uint32_t_;};buf1=&(((raw_cast*)(ntt_inner::_buf1))->dat);buf2=&(((raw_cast*)(ntt_inner::_buf2))->dat);}constexprvoidsetwy(intk){mintw[level],y[level];w[k-1]=mint(pr).pow((mod-1)/(1<<k));y[k-1]=w[k-1].inverse();for(inti=k-2;i>0;--i)w[i]=w[i+1]*w[i+1],y[i]=y[i+1]*y[i+1];dw[0]=dy[0]=w[1]*w[1];dw[1]=w[1],dy[1]=y[1],dw[2]=w[2],dy[2]=y[2];for(inti=3;i<k;++i){dw[i]=dw[i-1]*y[i-2]*w[i];dy[i]=dy[i-1]*w[i-2]*y[i];}}__attribute__((target("avx2")))voidntt(mint*a,intn){intk=n?__builtin_ctz(n):0;if(k==0)return;if(k==1){minta1=a[1];a[1]=a[0]-a[1];a[0]=a[0]+a1;return;}if(k&1){intv=1<<(k-1);if(v<8){for(intj=0;j<v;++j){mintajv=a[j+v];a[j+v]=a[j]-ajv;a[j]+=ajv;}}else{const__m256im0=_mm256_set1_epi32(0);const__m256im2=_mm256_set1_epi32(mod+mod);intj0=0;intj1=v;for(;j0<v;j0+=8,j1+=8){__m256iT0=_mm256_loadu_si256((__m256i*)(a+j0));__m256iT1=_mm256_loadu_si256((__m256i*)(a+j1));__m256inaj=montgomery_add_256(T0,T1,m2,m0);__m256inajv=montgomery_sub_256(T0,T1,m2,m0);_mm256_storeu_si256((__m256i*)(a+j0),naj);_mm256_storeu_si256((__m256i*)(a+j1),najv);}}}intu=1<<(2+(k&1));intv=1<<(k-2-(k&1));mintone=mint(1);mintimag=dw[1];while(v){if(v==1){mintww=one,xx=one,wx=one;for(intjh=0;jh<u;){ww=xx*xx,wx=ww*xx;mintt0=a[jh+0],t1=a[jh+1]*xx;mintt2=a[jh+2]*ww,t3=a[jh+3]*wx;mintt0p2=t0+t2,t1p3=t1+t3;mintt0m2=t0-t2,t1m3=(t1-t3)*imag;a[jh+0]=t0p2+t1p3,a[jh+1]=t0p2-t1p3;a[jh+2]=t0m2+t1m3,a[jh+3]=t0m2-t1m3;xx*=dw[__builtin_ctz((jh+=4))];}}elseif(v==4){const__m128im0=_mm_set1_epi32(0);const__m128im1=_mm_set1_epi32(mod);const__m128im2=_mm_set1_epi32(mod+mod);const__m128ir=_mm_set1_epi32(mint::r);const__m128iImag=_mm_set1_epi32(imag.a);mintww=one,xx=one,wx=one;for(intjh=0;jh<u;){if(jh==0){intj0=0;intj1=v;intj2=j1+v;intj3=j2+v;intje=v;for(;j0<je;j0+=4,j1+=4,j2+=4,j3+=4){const__m128iT0=_mm_loadu_si128((__m128i*)(a+j0));const__m128iT1=_mm_loadu_si128((__m128i*)(a+j1));const__m128iT2=_mm_loadu_si128((__m128i*)(a+j2));const__m128iT3=_mm_loadu_si128((__m128i*)(a+j3));const__m128iT0P2=montgomery_add_128(T0,T2,m2,m0);const__m128iT1P3=montgomery_add_128(T1,T3,m2,m0);const__m128iT0M2=montgomery_sub_128(T0,T2,m2,m0);const__m128iT1M3=montgomery_mul_128(montgomery_sub_128(T1,T3,m2,m0),Imag,r,m1);_mm_storeu_si128((__m128i*)(a+j0),montgomery_add_128(T0P2,T1P3,m2,m0));_mm_storeu_si128((__m128i*)(a+j1),montgomery_sub_128(T0P2,T1P3,m2,m0));_mm_storeu_si128((__m128i*)(a+j2),montgomery_add_128(T0M2,T1M3,m2,m0));_mm_storeu_si128((__m128i*)(a+j3),montgomery_sub_128(T0M2,T1M3,m2,m0));}}else{ww=xx*xx,wx=ww*xx;const__m128iWW=_mm_set1_epi32(ww.a);const__m128iWX=_mm_set1_epi32(wx.a);const__m128iXX=_mm_set1_epi32(xx.a);intj0=jh*v;intj1=j0+v;intj2=j1+v;intj3=j2+v;intje=j1;for(;j0<je;j0+=4,j1+=4,j2+=4,j3+=4){const__m128iT0=_mm_loadu_si128((__m128i*)(a+j0));const__m128iT1=_mm_loadu_si128((__m128i*)(a+j1));const__m128iT2=_mm_loadu_si128((__m128i*)(a+j2));const__m128iT3=_mm_loadu_si128((__m128i*)(a+j3));const__m128iMT1=montgomery_mul_128(T1,XX,r,m1);const__m128iMT2=montgomery_mul_128(T2,WW,r,m1);const__m128iMT3=montgomery_mul_128(T3,WX,r,m1);const__m128iT0P2=montgomery_add_128(T0,MT2,m2,m0);const__m128iT1P3=montgomery_add_128(MT1,MT3,m2,m0);const__m128iT0M2=montgomery_sub_128(T0,MT2,m2,m0);const__m128iT1M3=montgomery_mul_128(montgomery_sub_128(MT1,MT3,m2,m0),Imag,r,m1);_mm_storeu_si128((__m128i*)(a+j0),montgomery_add_128(T0P2,T1P3,m2,m0));_mm_storeu_si128((__m128i*)(a+j1),montgomery_sub_128(T0P2,T1P3,m2,m0));_mm_storeu_si128((__m128i*)(a+j2),montgomery_add_128(T0M2,T1M3,m2,m0));_mm_storeu_si128((__m128i*)(a+j3),montgomery_sub_128(T0M2,T1M3,m2,m0));}}xx*=dw[__builtin_ctz((jh+=4))];}}else{const__m256im0=_mm256_set1_epi32(0);const__m256im1=_mm256_set1_epi32(mod);const__m256im2=_mm256_set1_epi32(mod+mod);const__m256ir=_mm256_set1_epi32(mint::r);const__m256iImag=_mm256_set1_epi32(imag.a);mintww=one,xx=one,wx=one;for(intjh=0;jh<u;){if(jh==0){intj0=0;intj1=v;intj2=j1+v;intj3=j2+v;intje=v;for(;j0<je;j0+=8,j1+=8,j2+=8,j3+=8){const__m256iT0=_mm256_loadu_si256((__m256i*)(a+j0));const__m256iT1=_mm256_loadu_si256((__m256i*)(a+j1));const__m256iT2=_mm256_loadu_si256((__m256i*)(a+j2));const__m256iT3=_mm256_loadu_si256((__m256i*)(a+j3));const__m256iT0P2=montgomery_add_256(T0,T2,m2,m0);const__m256iT1P3=montgomery_add_256(T1,T3,m2,m0);const__m256iT0M2=montgomery_sub_256(T0,T2,m2,m0);const__m256iT1M3=montgomery_mul_256(montgomery_sub_256(T1,T3,m2,m0),Imag,r,m1);_mm256_storeu_si256((__m256i*)(a+j0),montgomery_add_256(T0P2,T1P3,m2,m0));_mm256_storeu_si256((__m256i*)(a+j1),montgomery_sub_256(T0P2,T1P3,m2,m0));_mm256_storeu_si256((__m256i*)(a+j2),montgomery_add_256(T0M2,T1M3,m2,m0));_mm256_storeu_si256((__m256i*)(a+j3),montgomery_sub_256(T0M2,T1M3,m2,m0));}}else{ww=xx*xx,wx=ww*xx;const__m256iWW=_mm256_set1_epi32(ww.a);const__m256iWX=_mm256_set1_epi32(wx.a);const__m256iXX=_mm256_set1_epi32(xx.a);intj0=jh*v;intj1=j0+v;intj2=j1+v;intj3=j2+v;intje=j1;for(;j0<je;j0+=8,j1+=8,j2+=8,j3+=8){const__m256iT0=_mm256_loadu_si256((__m256i*)(a+j0));const__m256iT1=_mm256_loadu_si256((__m256i*)(a+j1));const__m256iT2=_mm256_loadu_si256((__m256i*)(a+j2));const__m256iT3=_mm256_loadu_si256((__m256i*)(a+j3));const__m256iMT1=montgomery_mul_256(T1,XX,r,m1);const__m256iMT2=montgomery_mul_256(T2,WW,r,m1);const__m256iMT3=montgomery_mul_256(T3,WX,r,m1);const__m256iT0P2=montgomery_add_256(T0,MT2,m2,m0);const__m256iT1P3=montgomery_add_256(MT1,MT3,m2,m0);const__m256iT0M2=montgomery_sub_256(T0,MT2,m2,m0);const__m256iT1M3=montgomery_mul_256(montgomery_sub_256(MT1,MT3,m2,m0),Imag,r,m1);_mm256_storeu_si256((__m256i*)(a+j0),montgomery_add_256(T0P2,T1P3,m2,m0));_mm256_storeu_si256((__m256i*)(a+j1),montgomery_sub_256(T0P2,T1P3,m2,m0));_mm256_storeu_si256((__m256i*)(a+j2),montgomery_add_256(T0M2,T1M3,m2,m0));_mm256_storeu_si256((__m256i*)(a+j3),montgomery_sub_256(T0M2,T1M3,m2,m0));}}xx*=dw[__builtin_ctz((jh+=4))];}}u<<=2;v>>=2;}}__attribute__((target("avx2")))voidintt(mint*a,intn,intnormalize=true){intk=n?__builtin_ctz(n):0;if(k==0)return;if(k==1){minta1=a[1];a[1]=a[0]-a[1];a[0]=a[0]+a1;if(normalize){a[0]*=mint(2).inverse();a[1]*=mint(2).inverse();}return;}intu=1<<(k-2);intv=1;mintone=mint(1);mintimag=dy[1];while(u){if(v==1){mintww=one,xx=one,yy=one;u<<=2;for(intjh=0;jh<u;){ww=xx*xx,yy=xx*imag;mintt0=a[jh+0],t1=a[jh+1];mintt2=a[jh+2],t3=a[jh+3];mintt0p1=t0+t1,t2p3=t2+t3;mintt0m1=(t0-t1)*xx,t2m3=(t2-t3)*yy;a[jh+0]=t0p1+t2p3,a[jh+2]=(t0p1-t2p3)*ww;a[jh+1]=t0m1+t2m3,a[jh+3]=(t0m1-t2m3)*ww;xx*=dy[__builtin_ctz(jh+=4)];}}elseif(v==4){const__m128im0=_mm_set1_epi32(0);const__m128im1=_mm_set1_epi32(mod);const__m128im2=_mm_set1_epi32(mod+mod);const__m128ir=_mm_set1_epi32(mint::r);const__m128iImag=_mm_set1_epi32(imag.a);mintww=one,xx=one,yy=one;u<<=2;for(intjh=0;jh<u;){if(jh==0){intj0=0;intj1=v;intj2=v+v;intj3=j2+v;for(;j0<v;j0+=4,j1+=4,j2+=4,j3+=4){const__m128iT0=_mm_loadu_si128((__m128i*)(a+j0));const__m128iT1=_mm_loadu_si128((__m128i*)(a+j1));const__m128iT2=_mm_loadu_si128((__m128i*)(a+j2));const__m128iT3=_mm_loadu_si128((__m128i*)(a+j3));const__m128iT0P1=montgomery_add_128(T0,T1,m2,m0);const__m128iT2P3=montgomery_add_128(T2,T3,m2,m0);const__m128iT0M1=montgomery_sub_128(T0,T1,m2,m0);const__m128iT2M3=montgomery_mul_128(montgomery_sub_128(T2,T3,m2,m0),Imag,r,m1);_mm_storeu_si128((__m128i*)(a+j0),montgomery_add_128(T0P1,T2P3,m2,m0));_mm_storeu_si128((__m128i*)(a+j2),montgomery_sub_128(T0P1,T2P3,m2,m0));_mm_storeu_si128((__m128i*)(a+j1),montgomery_add_128(T0M1,T2M3,m2,m0));_mm_storeu_si128((__m128i*)(a+j3),montgomery_sub_128(T0M1,T2M3,m2,m0));}}else{ww=xx*xx,yy=xx*imag;const__m128iWW=_mm_set1_epi32(ww.a);const__m128iXX=_mm_set1_epi32(xx.a);const__m128iYY=_mm_set1_epi32(yy.a);intj0=jh*v;intj1=j0+v;intj2=j1+v;intj3=j2+v;intje=j1;for(;j0<je;j0+=4,j1+=4,j2+=4,j3+=4){const__m128iT0=_mm_loadu_si128((__m128i*)(a+j0));const__m128iT1=_mm_loadu_si128((__m128i*)(a+j1));const__m128iT2=_mm_loadu_si128((__m128i*)(a+j2));const__m128iT3=_mm_loadu_si128((__m128i*)(a+j3));const__m128iT0P1=montgomery_add_128(T0,T1,m2,m0);const__m128iT2P3=montgomery_add_128(T2,T3,m2,m0);const__m128iT0M1=montgomery_mul_128(montgomery_sub_128(T0,T1,m2,m0),XX,r,m1);__m128iT2M3=montgomery_mul_128(montgomery_sub_128(T2,T3,m2,m0),YY,r,m1);_mm_storeu_si128((__m128i*)(a+j0),montgomery_add_128(T0P1,T2P3,m2,m0));_mm_storeu_si128((__m128i*)(a+j2),montgomery_mul_128(montgomery_sub_128(T0P1,T2P3,m2,m0),WW,r,m1));_mm_storeu_si128((__m128i*)(a+j1),montgomery_add_128(T0M1,T2M3,m2,m0));_mm_storeu_si128((__m128i*)(a+j3),montgomery_mul_128(montgomery_sub_128(T0M1,T2M3,m2,m0),WW,r,m1));}}xx*=dy[__builtin_ctz(jh+=4)];}}else{const__m256im0=_mm256_set1_epi32(0);const__m256im1=_mm256_set1_epi32(mod);const__m256im2=_mm256_set1_epi32(mod+mod);const__m256ir=_mm256_set1_epi32(mint::r);const__m256iImag=_mm256_set1_epi32(imag.a);mintww=one,xx=one,yy=one;u<<=2;for(intjh=0;jh<u;){if(jh==0){intj0=0;intj1=v;intj2=v+v;intj3=j2+v;for(;j0<v;j0+=8,j1+=8,j2+=8,j3+=8){const__m256iT0=_mm256_loadu_si256((__m256i*)(a+j0));const__m256iT1=_mm256_loadu_si256((__m256i*)(a+j1));const__m256iT2=_mm256_loadu_si256((__m256i*)(a+j2));const__m256iT3=_mm256_loadu_si256((__m256i*)(a+j3));const__m256iT0P1=montgomery_add_256(T0,T1,m2,m0);const__m256iT2P3=montgomery_add_256(T2,T3,m2,m0);const__m256iT0M1=montgomery_sub_256(T0,T1,m2,m0);const__m256iT2M3=montgomery_mul_256(montgomery_sub_256(T2,T3,m2,m0),Imag,r,m1);_mm256_storeu_si256((__m256i*)(a+j0),montgomery_add_256(T0P1,T2P3,m2,m0));_mm256_storeu_si256((__m256i*)(a+j2),montgomery_sub_256(T0P1,T2P3,m2,m0));_mm256_storeu_si256((__m256i*)(a+j1),montgomery_add_256(T0M1,T2M3,m2,m0));_mm256_storeu_si256((__m256i*)(a+j3),montgomery_sub_256(T0M1,T2M3,m2,m0));}}else{ww=xx*xx,yy=xx*imag;const__m256iWW=_mm256_set1_epi32(ww.a);const__m256iXX=_mm256_set1_epi32(xx.a);const__m256iYY=_mm256_set1_epi32(yy.a);intj0=jh*v;intj1=j0+v;intj2=j1+v;intj3=j2+v;intje=j1;for(;j0<je;j0+=8,j1+=8,j2+=8,j3+=8){const__m256iT0=_mm256_loadu_si256((__m256i*)(a+j0));const__m256iT1=_mm256_loadu_si256((__m256i*)(a+j1));const__m256iT2=_mm256_loadu_si256((__m256i*)(a+j2));const__m256iT3=_mm256_loadu_si256((__m256i*)(a+j3));const__m256iT0P1=montgomery_add_256(T0,T1,m2,m0);const__m256iT2P3=montgomery_add_256(T2,T3,m2,m0);const__m256iT0M1=montgomery_mul_256(montgomery_sub_256(T0,T1,m2,m0),XX,r,m1);const__m256iT2M3=montgomery_mul_256(montgomery_sub_256(T2,T3,m2,m0),YY,r,m1);_mm256_storeu_si256((__m256i*)(a+j0),montgomery_add_256(T0P1,T2P3,m2,m0));_mm256_storeu_si256((__m256i*)(a+j2),montgomery_mul_256(montgomery_sub_256(T0P1,T2P3,m2,m0),WW,r,m1));_mm256_storeu_si256((__m256i*)(a+j1),montgomery_add_256(T0M1,T2M3,m2,m0));_mm256_storeu_si256((__m256i*)(a+j3),montgomery_mul_256(montgomery_sub_256(T0M1,T2M3,m2,m0),WW,r,m1));}}xx*=dy[__builtin_ctz(jh+=4)];}}u>>=4;v<<=2;}if(k&1){v=1<<(k-1);if(v<8){for(intj=0;j<v;++j){mintajv=a[j]-a[j+v];a[j]+=a[j+v];a[j+v]=ajv;}}else{const__m256im0=_mm256_set1_epi32(0);const__m256im2=_mm256_set1_epi32(mod+mod);intj0=0;intj1=v;for(;j0<v;j0+=8,j1+=8){const__m256iT0=_mm256_loadu_si256((__m256i*)(a+j0));const__m256iT1=_mm256_loadu_si256((__m256i*)(a+j1));__m256inaj=montgomery_add_256(T0,T1,m2,m0);__m256inajv=montgomery_sub_256(T0,T1,m2,m0);_mm256_storeu_si256((__m256i*)(a+j0),naj);_mm256_storeu_si256((__m256i*)(a+j1),najv);}}}if(normalize){mintinvn=mint(n).inverse();for(inti=0;i<n;i++)a[i]*=invn;}}__attribute__((target("avx2")))voidinplace_multiply(intl1,intl2,intzero_padding=true){intl=l1+l2-1;intM=4;while(M<l)M<<=1;if(zero_padding){for(inti=l1;i<M;i++)ntt_inner::_buf1[i]=0;for(inti=l2;i<M;i++)ntt_inner::_buf2[i]=0;}const__m256im0=_mm256_set1_epi32(0);const__m256im1=_mm256_set1_epi32(mod);const__m256ir=_mm256_set1_epi32(mint::r);const__m256iN2=_mm256_set1_epi32(mint::n2);for(inti=0;i<l1;i+=8){__m256ia=_mm256_loadu_si256((__m256i*)(ntt_inner::_buf1+i));__m256ib=montgomery_mul_256(a,N2,r,m1);_mm256_storeu_si256((__m256i*)(ntt_inner::_buf1+i),b);}for(inti=0;i<l2;i+=8){__m256ia=_mm256_loadu_si256((__m256i*)(ntt_inner::_buf2+i));__m256ib=montgomery_mul_256(a,N2,r,m1);_mm256_storeu_si256((__m256i*)(ntt_inner::_buf2+i),b);}ntt(buf1,M);ntt(buf2,M);for(inti=0;i<M;i+=8){__m256ia=_mm256_loadu_si256((__m256i*)(ntt_inner::_buf1+i));__m256ib=_mm256_loadu_si256((__m256i*)(ntt_inner::_buf2+i));__m256ic=montgomery_mul_256(a,b,r,m1);_mm256_storeu_si256((__m256i*)(ntt_inner::_buf1+i),c);}intt(buf1,M,false);const__m256iINVM=_mm256_set1_epi32((mint(M).inverse()).a);for(inti=0;i<l;i+=8){__m256ia=_mm256_loadu_si256((__m256i*)(ntt_inner::_buf1+i));__m256ib=montgomery_mul_256(a,INVM,r,m1);__m256ic=my256_mulhi_epu32(my256_mullo_epu32(b,r),m1);__m256id=_mm256_and_si256(_mm256_cmpgt_epi32(c,m0),m1);__m256ie=_mm256_sub_epi32(d,c);_mm256_storeu_si256((__m256i*)(ntt_inner::_buf1+i),e);}}voidntt(vector<mint>&a){intM=(int)a.size();for(inti=0;i<M;i++)buf1[i].a=a[i].a;ntt(buf1,M);for(inti=0;i<M;i++)a[i].a=buf1[i].a;}voidintt(vector<mint>&a){intM=(int)a.size();for(inti=0;i<M;i++)buf1[i].a=a[i].a;intt(buf1,M,true);for(inti=0;i<M;i++)a[i].a=buf1[i].a;}vector<mint>multiply(constvector<mint>&a,constvector<mint>&b){if(a.size()==0&&b.size()==0)returnvector<mint>{};intl=a.size()+b.size()-1;if(min<int>(a.size(),b.size())<=40){vector<mint>s(l);for(inti=0;i<(int)a.size();++i)for(intj=0;j<(int)b.size();++j)s[i+j]+=a[i]*b[j];returns;}assert(l<=ntt_inner::SZ_FFT_BUF);intM=4;while(M<l)M<<=1;for(inti=0;i<(int)a.size();++i)buf1[i].a=a[i].a;for(inti=(int)a.size();i<M;++i)buf1[i].a=0;for(inti=0;i<(int)b.size();++i)buf2[i].a=b[i].a;for(inti=(int)b.size();i<M;++i)buf2[i].a=0;ntt(buf1,M);ntt(buf2,M);for(inti=0;i<M;++i)buf1[i].a=mint::reduce(uint64_t(buf1[i].a)*buf2[i].a);intt(buf1,M,false);vector<mint>s(l);mintinvm=mint(M).inverse();for(inti=0;i<l;++i)s[i]=buf1[i]*invm;returns;}voidntt_doubling(vector<mint>&a){intM=(int)a.size();for(inti=0;i<M;i++)buf1[i].a=a[i].a;intt(buf1,M);mintr=1,zeta=mint(pr).pow((mint::get_mod()-1)/(M<<1));for(inti=0;i<M;i++)buf1[i]*=r,r*=zeta;ntt(buf1,M);a.resize(2*M);for(inti=0;i<M;i++)a[M+i].a=buf1[i].a;}};#line 7 "verify/verify-yosupo-ntt/yosupo-convolution-ntt-avx2.test.cpp"
constexprintMOD=998244353;usingmint=LazyMontgomeryModInt<MOD>;usingvm=vector<mint>;usingnamespaceNyaan;voidNyaan::solve(){NTT<mint>ntt;ini(N,M);vma(N),b(M);in(a,b);autoc=ntt.multiply(a,b);out(c);}