Personal Video Database
English => Development => Topic started by: gibbon on December 16, 2011, 05:58:54 pm
-
Hello,
thanks for the great app and especially its Unicode support.
I've started to write an import script for a Japanese site (EUC-JP encoding, CP:20932) and I've noticed a very strange thing.
Raw page saved automatically to file 'page.html' is OK. But when I process the page by the script (even when I output the HTML string right at the script start) it becomes broken at some places. For example:
<a href="/digital/videoa/-/detail/=/cid=41djk012/">猥らなほどに悩ましい 古都ひかる</a></p>
becomes
<a href="/digital/videoa/-/detail/=/cid=41djk012/">猥らなほどに悩ましい 古都ひか・E/a></p>
Besides changing the text, this destroys the whole HTML structure.
Other examples out of many more:
~ -> ?
奥さん! -> ・E気鵝・
女 2 -> ・E2
For many hours I've been trying different codepages of the script (20932, autodetect, UTF8), but garbled text or errors like this one are the only results.
Has anyone had a similar experience? Could there be a bug in the script parser? Is there any workaround?
Thank you very much in advance.
-
I think it would be easier if you provide the script itself for test.
-
Here is the testing version of the script. It doesn't even generate results, just writes the test files.
I used Adult DVD Empire script as a template.
It's possible that the "analyze=V1EBCFcEUAc_" part of SEARCH_STR is generated differently for each IP/user. So if the search doesn't work, please go to the website and replace it with your own unique one.
Thank you again. :)
//This version of script is for use with PVD versions 0.9.9.16 and above!!!
(*
Additional types and functions that can be used in scripts:
//Types
TWIDEARRAY : array of String
//Field functions
procedure AddSearchResult(Title1, Title2, Year, URL, PreviewURL : String)
procedure AddFieldValue(AField: Integer; AValue : String)
procedure AddMoviePerson(Name, TransName, Role, URL : String; AType : Byte)
procedure AddPersonMovie(Title, OrigTitle, Role, Year, URL : String; AType : Byte)
procedure AddAward(Event, Award, Category, Recipient, Year: String; const Won : Boolean)
procedure AddConnection(Title, OrigTitle, Category, URL, Year: String)
procedure AddEpisode(Title, OrigTitle, Description, URL, Year, Season, Episode : String)
//String functions
function Pos(Substr : String; Str: String): Integer
function PosFrom(const SubStr, Str : String; FromIndex : Integer) : Integer
function LastPos(const SubStr, Str : String) : Integer
function PrevPos(const SubStr, Str : String; APos : Integer) : Integer
function RemoveTags(AText : String; doLineBreaks : Boolean) : String
function ExplodeString(AText : String; var Items : TWideArray; Delimiters : String) : Integer
function Copy(S: String; Index, Count: Integer): String
procedure Delete(var S: String; Index, Count: Integer)
procedure Insert(Source: String; var Dest: String; Index: Integer)
function Length(S: String): Integer
function Trim(S: String): String
function CompareText(S1, S2: String): Integer
function CompareStr(S1, S2: String): Integer
function UpperCase(S: String): String
function LowerCase(S: String): String
function StringReplace(S, OldPattern, NewPattern: String; ReplaceAll : Boolean; IgnoreCase : Boolean; WholeWord: Boolean): String
function StrToInt(const S: String): Integer
function IntToStr(const Value: Integer): String
function StrToFloat(const S: String): Extended
function FloatToStr(const Value: Extended): String
function HTMLValues(const HTML : String; ABegin, AEnd, ItemBegin, ItemEnd : String; ValDelim : String; var Pos : Integer) : String
function HTMLValues2(const HTML : String; ABegin, AEnd, ItemBegin, ItemEnd : String; ValDelim : String; var Pos : Integer) : String
function TextBetween(const HTML : String; ABegin, AEnd : String; doLineBreaks : Boolean; var Pos : Integer) : String
function HTMLToText(const HTML : String) : String
procedure ShowMessage(const Msg, Head : String)
*)
const
  pauseBeforeLoad = 0; // Pause before loading (in millisecond)

//Some useful constants
const
  //Script types
  stMovies = 0;
  stPeople = 1;
  stPoster = 2;

  //Script modes
  smSearch = 0;
  smNormal = 1;
  smPoster = 2;

  //Parse results (values returned by ParsePage)
  prError = 0;
  prFinished = 1;
  prList = 2;
  prListImage = 3;
  prDownload = 4;

  //Movie fields
  mfURL = 0;
  mfTitle = 1;
  mfOrigTitle = 2;
  mfAka = 3;
  mfYear = 4;
  mfGenre = 5;
  mfCategory = 6;
  mfCountry = 7;
  mfStudio = 8;
  mfMPAA = 9;
  mfRating = 10;
  mfTags = 11;
  mfTagline = 12;
  mfDescription = 13;
  mfDuration = 14;
  mfFeatures = 15;

  //People fields
  pfURL = 0;
  pfName = 1;
  pfTransName = 2;
  pfAltNames = 3;
  pfBirthday = 4;
  pfBirthplace = 5;
  pfGenre = 6;
  pfBio = 7;
  pfDeathDate = 8;

  //Credits types
  ctActors = 0;
  ctDirectors = 1;
  ctWriters = 2;
  ctComposers = 3;
  ctProducers = 4;

  //Script data
  SCRIPT_VERSION = '0.0.0.4';
  SCRIPT_NAME = 'DMM.co.jp';
  SCRIPT_DESC = '[EN] Get movie information DMM.co.jp';
  SCRIPT_LANG = $11; //Japanese //Tested both English & Japanese
  SCRIPT_TYPE = stMovies;
  BASE_URL = 'http://www.dmm.co.jp';
  RATING_NAME = 'DMM';
  // %s is the placeholder for the search text
  SEARCH_STR = 'http://www.dmm.co.jp/search/=/searchstr=%s/analyze=V1EBCFcEUAc_/limit=30/sort=rank_asc/view=text/num=1/';
  CODE_PAGE = 20932; // Tested: 0, 33722, 51932, 20932, 65001

//Global variables
var
  Mode : Byte;        // current script mode (one of the sm* constants)
  PosterURL : String; // empty until a poster URL has been stored
//Functions
// Returns the version string of this script (SCRIPT_VERSION).
function GetScriptVersion : String;
begin
  Result := SCRIPT_VERSION;
end;
// Returns the display name of this script (SCRIPT_NAME).
function GetScriptName : String;
begin
  Result := SCRIPT_NAME;
end;
// Returns the short description of this script (SCRIPT_DESC).
function GetScriptDesc : String;
begin
  Result := SCRIPT_DESC;
end;
// Returns the name used for ratings coming from this site (RATING_NAME).
function GetRatingName : String;
begin
  Result := RATING_NAME;
end;
// Returns the language identifier of this script (SCRIPT_LANG).
function GetScriptLang: Cardinal;
begin
  Result := SCRIPT_LANG;
end;
// Returns the code page number configured for this script (CODE_PAGE).
function GetCodePage : Cardinal;
begin
  Result := CODE_PAGE;
end;
// Returns the root address of the site (BASE_URL).
function GetBaseURL : AnsiString;
begin
  Result := BASE_URL;
end;
// Returns the URL to download next: the value of PosterURL once one has
// been stored there, otherwise the search URL template SEARCH_STR.
function GetDownloadURL : AnsiString;
begin
  if PosterURL <> '' then
    Result := PosterURL
  else
    Result := SEARCH_STR;
end;
// Returns the kind of script this is (SCRIPT_TYPE, one of the st* constants).
function GetScriptType : Byte;
begin
  Result := SCRIPT_TYPE;
end;
// Returns the current value of the global Mode variable (one of the sm* constants).
function GetCurrentMode : Byte;
begin
  Result := Mode;
end;
// Placeholder for poster extraction. The body is empty, so HTML is not
// inspected and PosterURL is left unchanged.
procedure FindPoster(HTML : String);
var
  curPos : Integer; // not used yet
  EndPos : Integer; // not used yet
begin
  // not yet implemented
end;
// ****************** ParseMovie
// Records the movie page address in the mfURL field. Extraction of the
// other fields from HTML is not implemented yet.
//   MovieURL - address of the movie page
//   HTML     - page text (currently unused)
procedure ParseMovie(MovieURL : String; HTML : String);
var
  // none of these locals is used yet
  curPos, EndPos : Integer;
  P, P2, L : Integer;
  Tmp : String;
  URL, Name : String;
begin
  AddFieldValue(mfURL, MovieURL);
  // not yet implemented
end;
// ****************** ParseSearchResults
// Debug version of the search-results parser: writes the whole page and the
// cut-out results table to files so their text can be inspected.
// No search results are added yet.
//   HTML - text of the search-results page
procedure ParseSearchResults(HTML : String);
var
  curPos, EndPos : Integer;
  Tabulka : String;
begin
  Tabulka := '';
  curPos := PosFrom('<table summary="', HTML, 1); //beginning of results table
  if curPos > 0 then begin
    EndPos := PosFrom('</table>', HTML, curPos); //end of results table
    // Cut the table out only when both markers were found. This assumes
    // PosFrom, like Pos, yields 0 when nothing is found; without the guard
    // Copy would be called with a meaningless (zero or negative) range.
    if EndPos > curPos then
      Tabulka := Copy(HTML, curPos, EndPos - curPos);
  end;
  StringToFile('wholepage.htm', HTML, false, false); //debug - output test
  StringToFile('tabulka.htm', Tabulka, false, false); //debug - output test
  // LogMessage('CodePage:' + IntToStr(GetCodePage));
end;
// ****************** ParsePage
// Classifies a downloaded page by looking for marker tags in HTML and
// returns one of the pr* parse-result codes.
//   HTML - text of the page
//   URL  - address of the page; handed to ParseMovie
function ParsePage(HTML : String; URL : AnsiString) : Cardinal;
begin
  Wait(pauseBeforeLoad);
  //HTML := ConvertEncoding(HTML, 20932);
  StringToFile('start.htm', HTML, false, false); //debug - output test

  if Pos('<div class="othertxt">', HTML) > 0 then
  begin
    // this marker is treated as an empty search result
    LogMessage('Nothing Found!'); //debug
    Result := prError;
  end
  else if Pos('<th scope="col">', HTML) > 0 then
  begin
    // a table header cell marks the list of search results
    LogMessage('Calling ParseSearchResults!'); //debug
    ParseSearchResults(HTML);
    Result := prList;
  end
  else if Pos('<h1 id="title"', HTML) > 0 then
  begin
    // NOTE(review): this branch only looks for a poster, while ParseMovie is
    // called for pages matching none of the markers - confirm this is intended.
    LogMessage('Product page!'); //debug
    FindPoster(HTML);
    Mode := smPoster;
    if PosterURL = '' then
      Result := prFinished
    else
      Result := prDownload;
  end
  else
  begin
    ParseMovie(URL, HTML);
    Mode := smNormal;
    if PosterURL <> '' then
      Result := prDownload
    else
      Result := prFinished;
  end;
end;
// Script body: start in search mode.
begin
  Mode := smSearch;
end.
-
Are you sure that the actual data in memory is like this? I have investigated the problem and it looks correct in memory, but gets scrambled when saving to a file (probably because the encoding information is lost).
Try to parse a page and return some data as result to PVD. Would the fields be filled with correct strings?
-
Are you sure that the actual data in memory is like this? I have investigated the problem and it looks correct in memory, but gets scrambled when saving to a file (probably because the encoding information is lost).
Try to parse a page and return some data as result to PVD. Would the fields be filled with correct strings?
Unfortunately they're broken as well. That's how I noticed. File output was just for debugging purpose.
Please find the attached picture with PVD 'Select Movie' window. The checked lines have some broken characters.
This is the log. You can see the encoding errors broke the parsing logic:
(12/27/2011 5:07:47 PM) UpdateToolbar: 6
(12/27/2011 5:07:47 PM) UpdateToolbar: 7
(12/27/2011 5:07:56 PM) Compiling script: dmm_0_0_4.psf
(12/27/2011 5:07:56 PM) Script compiled successfully: dmm_0_0_4.psf
[Hint] (381:2): Variable 'P' never used
(12/27/2011 5:07:56 PM) Executing script binary
(12/27/2011 5:07:56 PM) Logging in...
(12/27/2011 5:07:56 PM) Searching movie information for: djk
(12/27/2011 5:07:56 PM) GET: http://www.dmm.co.jp/search/=/searchstr=djk/analyze=V1EBCFcEUAc_/limit=30/sort=rank_asc/view=text/num=1/
(12/27/2011 5:07:58 PM) start!
(12/27/2011 5:07:58 PM) ParseSearchResults! !
(12/27/2011 5:07:58 PM) CodePage:0
(12/27/2011 5:07:58 PM) URL: http://www.dmm.co.jp/digital/m_full/-/detail/=/cid=djk11/
(12/27/2011 5:07:58 PM) Title: 少女の道・E11
(12/27/2011 5:07:58 PM) URL: http://www.dmm.co.jp/digital/m_full/-/detail/=/cid=djk18/
(12/27/2011 5:07:58 PM) Title: 少女の道・E18
(12/27/2011 5:07:58 PM) URL: http://www.dmm.co.jp/digital/m_full/-/detail/=/cid=djk19/
(12/27/2011 5:07:58 PM) Title: 少女の道・E19
(12/27/2011 5:07:58 PM) URL: http://www.dmm.co.jp/digital/m_full/-/detail/=/cid=djk07/
(12/27/2011 5:07:58 PM) Title: 少女の道・E7
(12/27/2011 5:07:58 PM) URL: http://www.dmm.co.jp/digital/m_full/-/detail/=/cid=djk14/
(12/27/2011 5:07:58 PM) Title: 少女の道・E14
(12/27/2011 5:07:58 PM) URL: http://www.dmm.co.jp/digital/m_full/-/detail/=/cid=djk10/
(12/27/2011 5:07:58 PM) Title: 少女の道・E10
(12/27/2011 5:07:58 PM) URL: http://www.dmm.co.jp/digital/m_full/-/detail/=/cid=djk09/
(12/27/2011 5:07:58 PM) Title: 少女の道・E9
(12/27/2011 5:07:58 PM) URL: http://www.dmm.co.jp/digital/videoa/-/detail/=/cid=51djk009/
(12/27/2011 5:07:58 PM) Title: 猥E熟・E8 岡・EE/a></p>
<p class="status">
<span class="ico-st-monopoly"><span>独・E/span></span>
<!--/status--></p>
</td>
<td><a href="/search/=/searchstr=djk/limit=30/n1=FgRCTw9VBA4GAVhfWkIHWw__/n2=Aw1fVhQKX1ZRAlhMUlo5QQgBU1lR/sort=rank_asc/view=text/num=1/">ビデオ
(12/27/2011 5:07:58 PM) URL: http://www.dmm.co.jp/monthly/mania/-/detail/=/cid=51djk009/
(12/27/2011 5:07:58 PM) Title: 猥E熟・E8 岡・EE/a>
</p></td>
<td><a href="/search/=/searchstr=djk/limit=30/n1=FgRCTw9VBA4GCF5WR14KTg__/n2=Aw1fVhQKX19XC0VQX085WgALX1c_/sort=rank_asc/view=text/num=1/">マニア
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/digital/videoa/-/detail/=/cid=djks01/
(12/27/2011 5:07:59 PM) Title: 渋谷女子校生 少女の道・E8時間
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/digital/videoa/-/detail/=/cid=h_275tdjk00016/
(12/27/2011 5:07:59 PM) Title: 宅配露出 2
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/digital/videoa/-/detail/=/cid=h_275tdjk00004/
(12/27/2011 5:07:59 PM) Title: 宅配露出
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/digital/videoa/-/detail/=/cid=h_275tdjk00015/
(12/27/2011 5:07:59 PM) Title: ・・E灰鷯・憤E中・E弧穏泪Eぅ芛/a></p>
</td>
<td><a href="/search/=/searchstr=djk/limit=30/n1=FgRCTw9VBA4GAVhfWkIHWw__/n2=Aw1fVhQKX1ZRAlhMUlo5QQgBU1lR/sort=rank_asc/view=text/num=1/">ビデオ
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/digital/videoa/-/detail/=/cid=h_157djk010/
(12/27/2011 5:07:59 PM) Title: 女子校生見せつけオナニー
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/monthly/mania/-/detail/=/cid=h_157djk010/
(12/27/2011 5:07:59 PM) Title: 女子校生見せつけオナニー
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/digital/videoa/-/detail/=/cid=41djk012/
(12/27/2011 5:07:59 PM) Title: 猥らなほどに悩ましい 古都ひか・E/a></p>
</td>
<td><a href="/search/=/searchstr=djk/limit=30/n1=FgRCTw9VBA4GAVhfWkIHWw__/n2=Aw1fVhQKX1ZRAlhMUlo5QQgBU1lR/sort=rank_asc/view=text/num=1/">ビデオ
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/monthly/hmp/-/detail/=/cid=41djk012/
(12/27/2011 5:07:59 PM) Title: 猥らなほどに悩ましい 古都ひか・E/a>
</p></td>
<td><a href="/search/=/searchstr=djk/limit=30/n1=FgRCTw9VBA4GCF5WR14KTg__/n2=Aw1fVhQKX19XC0VQX085XwwV/sort=rank_asc/view=text/num=1/">h.m.p
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/digital/videoa/-/detail/=/cid=djks03/
(12/27/2011 5:07:59 PM) Title: 渋谷女子校生 少女の道・E8時間 3
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/digital/videoa/-/detail/=/cid=41hodv00211/
(12/27/2011 5:07:59 PM) Title: ・Eぅ廚気E拭・古都ひか・E/a></p>
</td>
<td><a href="/search/=/searchstr=djk/limit=30/n1=FgRCTw9VBA4GAVhfWkIHWw__/n2=Aw1fVhQKX1ZRAlhMUlo5QQgBU1lR/sort=rank_asc/view=text/num=1/">ビデオ
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/monthly/hmp/-/detail/=/cid=41hodv00211/
(12/27/2011 5:07:59 PM) Title: ・Eぅ廚気E拭・古都ひか・E/a>
</p></td>
<td><a href="/search/=/searchstr=djk/limit=30/n1=FgRCTw9VBA4GCF5WR14KTg__/n2=Aw1fVhQKX19XC0VQX085XwwV/sort=rank_asc/view=text/num=1/">h.m.p
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/digital/videoa/-/detail/=/cid=djks02/
(12/27/2011 5:07:59 PM) Title: 渋谷女子校生 少女の道・E8時間 2
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/mono/dvd/-/detail/=/cid=djks02/
(12/27/2011 5:07:59 PM) Title: 渋谷女子校生 少女の道・E8時間 2
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/mono/dvd/-/detail/=/cid=djks01/
(12/27/2011 5:07:59 PM) Title: 渋谷女子校生 少女の道・E8時間
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/mono/dvd/-/detail/=/cid=h_275tdjk16/
(12/27/2011 5:07:59 PM) Title: 宅配露出 2
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/mono/dvd/-/detail/=/cid=10dnsh001/
(12/27/2011 5:07:59 PM) Title: パンチラ☆ちら見電車 〜チラチラ片道切符〜
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/monthly/mania/-/detail/=/cid=29djkc04/
(12/27/2011 5:07:59 PM) Title: 痴女×M・E2
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/monthly/mania/-/detail/=/cid=29djkc01/
(12/27/2011 5:07:59 PM) Title: 痴女×M・E/a>
</p></td>
<td><a href="/search/=/searchstr=djk/limit=30/n1=FgRCTw9VBA4GCF5WR14KTg__/n2=Aw1fVhQKX19XC0VQX085WgALX1c_/sort=rank_asc/view=text/num=1/">マニア
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/monthly/mania/-/detail/=/cid=29djkb01/
(12/27/2011 5:07:59 PM) Title: 爆乳痴態娘 1 水沢ダイア
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/monthly/mania/-/detail/=/cid=29djkb02/
(12/27/2011 5:07:59 PM) Title: 爆乳痴態娘 2 真咲菜々
(12/27/2011 5:07:59 PM) URL: http://www.dmm.co.jp/monthly/mania/-/detail/=/cid=29djkj01/
(12/27/2011 5:07:59 PM) Title: 痴的女教師
This is the parsed results page:
http://www.dmm.co.jp/search/=/searchstr=djk/analyze=V1EBCFcEUAc_/limit=30/sort=rank_asc/view=text/num=1/
And finally, this is the code of ParseSearchResults:
// Walks the search-results page and adds one search result per title
// paragraph (<p class="ttl" ...>) found after the start of the results table.
// For each entry the link target (prefixed with BASE_URL) becomes the URL
// and the link text becomes the title.
//   HTML - text of the search-results page
//
// Every PosFrom result is checked before it is used (0 is treated as
// "not found", as the loop condition already did). A failed search ends the
// loop instead of producing a negative Copy length, re-adding the previous
// Title/URL, or resetting the scan position to the top of the page.
//
// This does not repair titles whose characters arrive already corrupted:
// if the closing </a> of a title has been swallowed, the title still runs
// on to the next </a> in the page.
procedure ParseSearchResults(HTML : String);
var
  curPos, EndPos : Integer;
  Title, URL : String;
begin
  LogMessage('CodePage:' + IntToStr(GetCodePage));

  // Start scanning at the first title paragraph inside the results table.
  curPos := PosFrom('<table summary="', HTML, 1);
  if curPos > 0 then
    curPos := PosFrom('<p class="ttl"', HTML, curPos + Length('<table summary="'));

  while curPos > 0 do begin
    EndPos := 0;

    // Look for the link from the title paragraph itself, not from the
    // previous entry's end position.
    curPos := PosFrom('<a href="', HTML, curPos);
    if curPos > 0 then begin
      curPos := curPos + Length('<a href="');
      EndPos := PosFrom('">', HTML, curPos); // end of the href value
    end;

    if EndPos > 0 then begin
      URL := BASE_URL + Copy(HTML, curPos, EndPos - curPos);
      LogMessage('URL: ' + URL); //debug
      curPos := EndPos + 2; // skip the '">' that closes the opening tag
      EndPos := PosFrom('</a>', HTML, curPos); // end of the link text
    end;

    if EndPos > 0 then begin
      Title := Copy(HTML, curPos, EndPos - curPos);
      LogMessage('Title: ' + Title); //debug
      AddSearchResult(Title, '', '', URL, '');
      // Continue with the next title paragraph after this link.
      curPos := PosFrom('<p class="ttl"', HTML, EndPos);
    end else
      curPos := 0; // malformed or truncated entry: stop scanning
  end;
end;
Thank you for investigating the problem further.
[attachment deleted by admin]
-
Could you, please, check if the problem persists with the current beta version? (1.0.2.2)
-
Could you, please, check if the problem persists with the current beta version? (1.0.2.2)
Yes, it does. :( In 1.0.2.2 beta exactly the same problem.