01048: search in attachment (PDF, DOC ...)

Summary: search in attachment (PDF, DOC ...)
Created: 2008-09-16 13:24
Status: Open
Category: Feature
From: eggice?
Assigned:
Priority: 33
Version: 2.2.9
OS: Linux/Lighttpd/5

Description: I need an extension to search inside attachments, such as PDFs, so that I can use PmWiki as my knowledge management database.

I have so many PDFs. I want to index them with PmWiki.


Here is a piece of code that may help you out. CarlosAB January 03, 2009, at 04:22 PM

/**
 * Extract the plain text contained in a PDF file.
 *
 * Scans the file for "stream ... endstream" sections, inflates each one
 * with gzuncompress() and hands the result to pdfExtractText(). Sections
 * that cannot be inflated contribute nothing to the result.
 *
 * @param string $sourcefile Path of the PDF file to read.
 * @return string The extracted text with every run of whitespace collapsed
 *                to a single space; '' when the file is missing, unreadable
 *                or empty.
 */
function pdf2string($sourcefile) {

    // Bail out quietly instead of feeding a failed handle or a zero
    // length to the read functions.
    if (!is_file($sourcefile) || !is_readable($sourcefile)) {
        return '';
    }
    $content = file_get_contents($sourcefile);
    if ($content === false || $content === '') {
        return '';
    }

    $searchstart = 'stream';
    $searchend = 'endstream';
    $pdfText = '';
    $offset = 0;

    while (($pos = strpos($content, $searchstart, $offset)) !== false) {

        // The stream data begins after the keyword and its end-of-line
        // marker. Accept CRLF, LF or a lone CR rather than assuming two
        // bytes, so that "stream\n" files are not read one byte too late.
        $dataStart = $pos + strlen($searchstart);
        if (substr($content, $dataStart, 2) === "\r\n") {
            $dataStart += 2;
        } else if (isset($content[$dataStart])
            && ($content[$dataStart] === "\n" || $content[$dataStart] === "\r")) {
            $dataStart++;
        }

        $dataEnd = strpos($content, $searchend, $dataStart);
        if ($dataEnd === false) {
            break;
        }

        // Resume after this "endstream" so that the "stream" inside it is
        // not mistaken for the start of another section.
        $offset = $dataEnd + strlen($searchend);

        if ($dataEnd > $dataStart) {
            // The end-of-line marker in front of "endstream" is left in
            // place: a compressed stream may itself end in a 0x0a or 0x0d
            // byte, so stripping it could truncate valid data.
            // Inflation failures (e.g. non-Flate streams) are expected
            // here, hence the suppressed warning.
            $data = @gzuncompress(
                substr($content, $dataStart, $dataEnd - $dataStart)
            );
            if (is_string($data)) {
                $pdfText .= pdfExtractText($data);
            }
        }
    }

    return preg_replace('/(\s)+/', ' ', $pdfText);

}

/**
 * Collect the text operands from a decoded PDF content stream.
 *
 * Matches sequences of the form "<op> (string) Tj" or "<op> [array] Tj",
 * where <op> is "T" followed by one of w, d, c, m or *, compared without
 * regard to case. The string operands are concatenated and a fixed table
 * of escapes and special characters is then translated.
 *
 * @param mixed $psData Decoded stream contents; any non-string value
 *                      produces an empty result.
 * @return string The extracted, translated text.
 */
function pdfExtractText($psData){

    if (!is_string($psData)) {
        return '';
    }

    // An escaped ")" or "]" would close the character classes in the
    // pattern below too early, so swap them for placeholders first; the
    // translation table at the end turns them back into brackets.
    $stream = str_replace(
        array('\)', '\]'),
        array('##ENDBRACKET##', '##ENDSBRACKET##'),
        $psData
    );

    preg_match_all(
        '/(T[wdcm*])[\s]*(\[([^\]]*)\]|\(([^\)]*)\))[\s]*Tj/si',
        $stream,
        $found
    );

    $pieces = array();
    foreach (array_keys($found[0]) as $idx) {
        $operator = $found[1][$idx];
        $arrayOperand = $found[3][$idx];
        $stringOperand = $found[4][$idx];

        if ($arrayOperand !== '') {
            // An array operand holds several "(...)" strings; join them all.
            preg_match_all('/\(([^)]*)\)/si', $arrayOperand, $inner);
            $pieces[] = implode('', $inner[1]);
            continue;
        }

        if ($stringOperand !== '') {
            // A string introduced by exactly "Tc" gets a leading space.
            $prefix = ($operator === 'Tc') ? ' ' : '';
            $pieces[] = $prefix . $stringOperand;
        }
    }

    // Literal escape sequences, placeholders and single bytes to replace
    // in the collected text.
    $table = array(
        // bracket escapes and the placeholders inserted above
        '\('                => '(',
        '\['                => '[',
        '##ENDBRACKET##'    => ')',
        '##ENDSBRACKET##'   => ']',
        // textual escapes
        '...'               => '…',
        '\205'              => '…',
        '\221'              => chr(145),
        '\222'              => chr(146),
        '\223'              => chr(147),
        '\224'              => chr(148),
        '\226'              => '-',
        '\267'              => '•',
        // single bytes
        chr(133)            => '-',
        chr(141)            => chr(147),
        chr(142)            => chr(148),
        chr(143)            => chr(145),
        chr(144)            => chr(146),
    );

    return strtr(implode('', $pieces), $table);

}