卿大爷的多个女友 后缀数组、最长连续重复子串
Source
2016 UESTC Training for Search Algorithm & String
My Solution
题意:给一个字符串s,求最长连续重复子串的长度,如cdaabababcc的最长连续重复子串为ababab,长度为6。(由ab连续重复拼接而成)
后缀数组、最长连续重复子串
倍增算法O(nlogn)的跑出后缀数组sa[],然后再O(n)的跑出height数组,
height[i] = LCP(i - 1, i); //LCP(u, v) = max{i | u[1..i] == v[1..i] }
LCP(i, j) = min{LCP(k - 1, k) | 1 < k <= j}
令int lcp = LCP(i, j); diat = abs(sa[i - 1] - sa[j]);
则当lcp >= diat 时, 有连续重复子串出现,ans = max(ans, diat * (lcp / diat + 1);
比如
s = abababc
1 abababc
2 ababc
lcp = 4, diat = 2;
s1 = s1[1..diat] + s2;
所以s1里有 (lcp / diat + 1)个s1[1..diat]
abababc
ababc
由于lcp == 4, diat = = 2, 故s2有的ab对应s2[12] == s1[12],但由于s1 == suffix(7), s2 = suffix(5),所以必有 s2[12] == s1[34],
又由于lcp == 4,所以s1[34] == s2[34],从而又由s1 == suffix(7), s2 = suffix(5),得 s1[56] == s2[34]
故有 (lcp / diat + 1) 个 s1[1..diat],即3个ab
如果对于ababa可有
aba
ababa
lcp = 3, diat = 2;
故有 lcp / diat + 1 个diat,即2个ab。
此外由于sa[]中字符串隔的越远一般lcp越小,所以一般只计算i附近的几个j即可,if(lcp < ans) break;
最开的时候ans == 0, lcp可能也是0,但lcp一旦为0,接下来也必定还是0,故lcp为0时直接break。
复杂度 O(nlogn)
#include
#include
#include
#include
#include
using namespace std;
typedef long long LL;
const int maxn = 1e5 + 8;
int str[maxn], sa[maxn], height[maxn];
string s;
inline void radix(int *str, int *a, int *b, const int &n, const int &m)
{
static int cnt[maxn];
memset(cnt, 0, sizeof cnt);
for(int i = 0; i < n; i++) ++cnt[str[a[i]]];
for(int i = 1; i <= m; i++) cnt[i] += cnt[i-1]; for(int i = n-1; i >= 0; i--) b[--cnt[str[a[i]]]] = a[i];
}
inline void suffix_array(int *str, int *sa, const int &n, const int &m)
{
static int _rank[maxn], a[maxn], b[maxn];
for(int i = 0; i < n; i++) _rank[i] = i;
radix(str, _rank, sa, n, m);
_rank[sa[0]] = 0;
for(int i = 1; i < n; i++){
_rank[sa[i]] = _rank[sa[i-1]] + (str[sa[i]] != str[sa[i-1]]);
}
for(int i = 0; (i<<i) < n; i++){
for(int j = 0; j < n; j++){
a[j] = _rank[j] + 1;
b[j] = j + (1<<i) >= n ? 0 : _rank[j + (1 << i)] + 1;
sa[j] = j;
}
radix(b, sa, _rank, n, n);
radix(a, _rank, sa, n, n);
_rank[sa[0]] = 0;
for(int j = 1; j < n; j++){
_rank[sa[j]] = _rank[sa[j-1]] + (a[sa[j-1]] != a[sa[j]] ||
b[sa[j-1]] != b[sa[j]]);
}
}
}
//h即为height数组
inline void calc_height(int *str, int *sa, int *h, const int &n)
{
static int _rank[maxn];
int k = 0;
h[0] = 0;
for(int i = 0; i < n; i++) _rank[sa[i]] = i;
for(int i = 0; i < n; i++){ k = k == 0 ? 0 : k - 1; if(_rank[i] != 0){ while(str[i + k] == str[sa[_rank[i]-1] + k]) k++; } h[_rank[i]] = k; } } int main() { #ifdef LOCAL freopen("a.txt", "r", stdin); //freopen("a.out", "w", stdout); int T = 2; while(T--){ #endif // LOCAL ios::sync_with_stdio(false); cin.tie(0); int n; cin >> s;
n = s.size();
for(int i = 0; i < n; i++) str[i] = (s[i] - 'a') + 1;
suffix_array(str, sa, n, 26);
calc_height(str, sa, height, n);
/*
for(int i = 0; i < n; i++){
for(int j = sa[i]; j < n; j++){
cout << s[j];
}
cout << endl;
}
*/
int lcp, ans = 0, diat;
for(int i = 1; i < n; i++){
lcp = height[i];
//cout << i << " : " << lcp << endl; diat = abs(sa[i-1] - sa[i]); if(lcp >= diat) ans = max(ans, diat * (lcp / diat + 1));
for(int j = i + 1; j < n; j++){
lcp = min(lcp, height[j]);
if(lcp == 0) break;
if(lcp * 2 < ans) break; //i,j相差越远lcp就越小,其实只要距离i最近的几个j就好了 diat = abs(sa[i-1] - sa[j]); if(lcp >= diat) ans = max(ans, diat * (lcp / diat + 1));
}
}
cout << ans << endl;
#ifdef LOCAL
cout << endl;
}
#endif // LOCAL
return 0;
}
非特殊说明,本博所有文章均为博主原创,未经许可不得转载。
https://www.prolightsfxjh.com/
Thank you!
------from ProLightsfx
非特殊说明,本博所有文章均为博主原创,未经许可不得转载。
如经许可后转载,请注明出处:https://prolightsfxjh.com/article/uestc-1384/
共有 0 条评论